AI Performance · Context Optimization

Efficient Attention Mechanisms for Long-Context Extensions

Extendable Team · 15 min read

Browser extensions often need to process long documents, maintain conversation history, and synthesize information across multiple tabs. Standard attention mechanisms become impractical for these use cases. This guide covers practical techniques for handling long contexts efficiently in your AI-powered extensions.

The Long Context Challenge

Standard transformer attention scales quadratically with sequence length. For a 128K token context window, this means:

  • 128K × 128K ≈ 16 billion pairwise attention scores per layer
  • Massive memory requirements
  • Unacceptable latency for interactive use
Context Window Realities:
  • GPT-4 Turbo: 128K tokens (~100 pages)
  • Claude: 200K tokens (~150 pages)
  • Practical limit: 8-16K before quality degrades
  • Cost scales with tokens (input + output); see the budgeting sketch below
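
To make these numbers concrete, the sketch below estimates token counts and request cost before a prompt is sent. The per-token prices are placeholder values for illustration, not real provider pricing:

// Back-of-envelope helpers for context budgeting.
// Prices are illustrative placeholders; check your provider's actual rates.
const PRICE_PER_1K_INPUT = 0.01;   // hypothetical $ per 1K input tokens
const PRICE_PER_1K_OUTPUT = 0.03;  // hypothetical $ per 1K output tokens

function estimateTokens(text) {
  return Math.ceil(text.length / 4); // rough rule of thumb: ~4 characters per token
}

function estimateRequest(promptText, expectedOutputTokens, contextLimit = 128000) {
  const inputTokens = estimateTokens(promptText);
  const totalTokens = inputTokens + expectedOutputTokens;
  return {
    inputTokens,
    fitsInContext: totalTokens <= contextLimit,
    // Quadratic attention: work grows with the square of the sequence length
    attentionPairs: totalTokens * totalTokens,
    estimatedCost:
      (inputTokens / 1000) * PRICE_PER_1K_INPUT +
      (expectedOutputTokens / 1000) * PRICE_PER_1K_OUTPUT
  };
}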

Chunking Strategies

Semantic Chunking

Split content at meaningful boundaries:

class SemanticChunker {
  constructor(options = {}) {
    this.maxChunkSize = options.maxChunkSize || 1000; // tokens
    this.overlapSize = options.overlapSize || 100;
  }

  chunk(text) {
    // Split by semantic boundaries (paragraphs, sections)
    const sections = this.splitBySections(text);
    const chunks = [];

    for (const section of sections) {
      if (this.estimateTokens(section) <= this.maxChunkSize) {
        chunks.push(section);
      } else {
        // Sub-chunk large sections by paragraphs
        chunks.push(...this.chunkByParagraphs(section));
      }
    }

    return this.addOverlap(chunks);
  }

  splitBySections(text) {
    // Split by headers, horizontal rules, or large gaps
    return text.split(/(?=^#{1,3}\s|\n{3,}|^[-=]{3,}$)/m)
      .filter(s => s.trim());
  }

  chunkByParagraphs(text) {
    const paragraphs = text.split(/\n\n+/);
    const chunks = [];
    let currentChunk = '';

    for (const para of paragraphs) {
      if (this.estimateTokens(currentChunk + para) > this.maxChunkSize) {
        if (currentChunk) chunks.push(currentChunk.trim());
        currentChunk = para;
      } else {
        currentChunk += '\n\n' + para;
      }
    }

    if (currentChunk) chunks.push(currentChunk.trim());
    return chunks;
  }

  addOverlap(chunks) {
    return chunks.map((chunk, i) => {
      let result = chunk;

      // Add overlap from previous chunk
      if (i > 0) {
        const prevOverlap = this.getTrailingText(chunks[i-1], this.overlapSize);
        result = `[...] ${prevOverlap}\n\n${result}`;
      }

      // Add overlap from next chunk
      if (i < chunks.length - 1) {
        const nextOverlap = this.getLeadingText(chunks[i+1], this.overlapSize);
        result = `${result}\n\n${nextOverlap} [...]`;
      }

      return result;
    });
  }

  getTrailingText(text, tokens) {
    const words = text.split(/\s+/);
    return words.slice(-Math.round(tokens * 0.75)).join(' '); // ~0.75 words per token
  }

  getLeadingText(text, tokens) {
    const words = text.split(/\s+/);
    return words.slice(0, Math.round(tokens * 0.75)).join(' ');
  }

  estimateTokens(text) {
    // Rough estimation: 1 token ≈ 4 characters
    return Math.ceil(text.length / 4);
  }
}
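
A chunker like this can be dropped straight into a content script. The page-text source below is only an example:

// Example usage (the input source is illustrative)
const chunker = new SemanticChunker({ maxChunkSize: 800, overlapSize: 80 });
const pageText = document.body.innerText; // e.g. the text of the current tab
const chunks = chunker.chunk(pageText);
console.log(`Produced ${chunks.length} chunks`);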

Hierarchical Summarization

Create multi-level summaries for very long content:

class HierarchicalSummarizer {
  constructor(aiClient) {
    this.ai = aiClient;
    this.chunkSize = 2000; // tokens
    this.summaryRatio = 0.2; // 20% of original
  }

  async summarize(text) {
    const chunks = this.chunkText(text);

    // Level 1: Chunk summaries
    const chunkSummaries = await Promise.all(
      chunks.map(chunk => this.summarizeChunk(chunk))
    );

    // Level 2: Section summaries (group 5 chunks)
    const sectionSummaries = [];
    for (let i = 0; i < chunkSummaries.length; i += 5) {
      const section = chunkSummaries.slice(i, i + 5).join('\n\n');
      sectionSummaries.push(await this.summarizeChunk(section));
    }

    // Level 3: Document summary
    const documentSummary = await this.summarizeChunk(
      sectionSummaries.join('\n\n')
    );

    return {
      full: text,
      chunks: chunkSummaries,
      sections: sectionSummaries,
      document: documentSummary,
      hierarchy: this.buildHierarchy(chunks, chunkSummaries, sectionSummaries, documentSummary)
    };
  }

  async summarizeChunk(text) {
    const response = await this.ai.query({
      prompt: `Summarize the following text concisely, preserving key facts and relationships:\n\n${text}`,
      maxTokens: Math.ceil(this.estimateTokens(text) * this.summaryRatio)
    });
    return response.text;
  }

  buildHierarchy(chunks, chunkSummaries, sectionSummaries, documentSummary) {
    return {
      type: 'document',
      summary: documentSummary,
      children: sectionSummaries.map((ss, si) => ({
        type: 'section',
        summary: ss,
        children: chunkSummaries.slice(si * 5, (si + 1) * 5).map((cs, ci) => ({
          type: 'chunk',
          summary: cs,
          content: chunks[si * 5 + ci]
        }))
      }))
    };
  }

  chunkText(text) {
    const chunker = new SemanticChunker({ maxChunkSize: this.chunkSize });
    return chunker.chunk(text);
  }

  estimateTokens(text) {
    return Math.ceil(text.length / 4);
  }
}
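
Assuming an aiClient that exposes the query({ prompt, maxTokens }) method used above, the summarizer returns every level of the hierarchy so you can pick whichever fits your remaining context budget (longDocumentText is illustrative):

// Example usage (aiClient and longDocumentText are assumed)
const summarizer = new HierarchicalSummarizer(aiClient);
const result = await summarizer.summarize(longDocumentText);

console.log(result.document);           // one-paragraph overview of the whole document
console.log(result.sections.length);    // mid-level section summaries
console.log(result.hierarchy.children); // drill down to individual chunks on demand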

Sliding Window Attention

Process long content with a moving context window:

class SlidingWindowProcessor {
  constructor(aiClient, windowSize = 4000, strideSize = 3000) {
    this.ai = aiClient;
    this.windowSize = windowSize;
    this.strideSize = strideSize;
  }

  async process(text, query) {
    const windows = this.createWindows(text);
    const results = [];

    for (const window of windows) {
      const result = await this.processWindow(window, query);
      results.push(result);
    }

    return this.mergeResults(results);
  }

  createWindows(text) {
    const tokens = this.tokenize(text);
    const windows = [];

    for (let i = 0; i < tokens.length; i += this.strideSize) {
      windows.push({
        tokens: tokens.slice(i, i + this.windowSize),
        startOffset: i,
        endOffset: Math.min(i + this.windowSize, tokens.length)
      });

      // Stop if we've covered the entire text
      if (i + this.windowSize >= tokens.length) break;
    }

    return windows;
  }

  async processWindow(window, query) {
    const text = window.tokens.join(' ');
    const response = await this.ai.query({
      prompt: `Context:\n${text}\n\nQuery: ${query}\n\nAnswer based only on the context provided:`,
      maxTokens: 500
    });

    return {
      response: response.text,
      startOffset: window.startOffset,
      endOffset: window.endOffset,
      confidence: this.extractConfidence(response)
    };
  }

  async mergeResults(results) {
    // Filter out low-confidence results
    const relevant = results.filter(r => r.confidence > 0.3);

    if (relevant.length === 0) {
      return { answer: "No relevant information found.", sources: [] };
    }

    // If multiple relevant results, synthesize
    if (relevant.length > 1) {
      return this.synthesize(relevant);
    }

    return {
      answer: relevant[0].response,
      sources: [relevant[0]]
    };
  }

  async synthesize(results) {
    const synthesis = await this.ai.query({
      prompt: `Synthesize these related findings into a coherent answer:\n\n${
        results.map((r, i) => `Finding ${i+1}:\n${r.response}`).join('\n\n')
      }\n\nSynthesized answer:`,
      maxTokens: 500
    });

    return {
      answer: synthesis.text,
      sources: results
    };
  }

  tokenize(text) {
    // Word-level approximation of tokens; swap in a real tokenizer if available
    return text.split(/\s+/);
  }

  extractConfidence(response) {
    // Heuristic: check for hedging language
    const hedges = ['might', 'possibly', 'unclear', 'not sure', 'cannot find'];
    const hasHedge = hedges.some(h => response.text.toLowerCase().includes(h));
    return hasHedge ? 0.3 : 0.8;
  }
}
Window Sizing Tips:
  • Overlap windows by 20-30% to avoid missing cross-boundary content (helper sketch below)
  • Use larger windows for complex reasoning tasks
  • Use smaller windows for fact extraction
  • Adjust based on your API's context limit and cost
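
The overlap guidance above can be encoded as a small helper that derives a stride from a chosen window size and overlap fraction (aiClient is assumed, as in the constructor above):

// Derive a stride from a window size and desired overlap (20-30% suggested above)
function windowParams(windowSize, overlapFraction = 0.25) {
  const stride = Math.max(1, Math.floor(windowSize * (1 - overlapFraction)));
  return { windowSize, stride };
}

// A 4000-token window with 25% overlap gives a stride of 3000
const { windowSize, stride } = windowParams(4000, 0.25);
const processor = new SlidingWindowProcessor(aiClient, windowSize, stride);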

Sparse Attention Patterns

Implement attention patterns that skip irrelevant content:

class SparseAttentionManager {
  constructor() {
    this.relevanceScores = new Map();
  }

  // Score chunks for relevance to query
  async scoreChunks(chunks, query, embedder) {
    const queryEmbed = await embedder.embed(query);

    const scores = await Promise.all(chunks.map(async (chunk, i) => {
      const chunkEmbed = await embedder.embed(chunk);
      return {
        index: i,
        score: this.cosineSimilarity(queryEmbed, chunkEmbed)
      };
    }));

    return scores.sort((a, b) => b.score - a.score);
  }

  // Select chunks using sparse pattern
  selectChunks(rankedChunks, budget, pattern = 'topk-plus-boundary') {
    const selected = new Set();

    switch (pattern) {
      case 'topk':
        // Just take top K chunks
        rankedChunks.slice(0, budget).forEach(c => selected.add(c.index));
        break;

      case 'topk-plus-boundary':
        // Top K plus first and last chunks for context
        rankedChunks.slice(0, budget - 2).forEach(c => selected.add(c.index));
        selected.add(0); // First chunk
        selected.add(rankedChunks.length - 1); // Last chunk
        break;

      case 'topk-plus-neighbors':
        // Top K plus their neighbors for continuity
        const topK = Math.floor(budget / 2);
        rankedChunks.slice(0, topK).forEach(c => {
          selected.add(c.index);
          if (c.index > 0) selected.add(c.index - 1);
          if (c.index < rankedChunks.length - 1) selected.add(c.index + 1);
        });
        break;

      case 'stratified':
        // Sample from different parts of document
        const topCount = Math.floor(budget * 0.6);
        const stratifiedCount = budget - topCount;

        rankedChunks.slice(0, topCount).forEach(c => selected.add(c.index));

        const stride = Math.floor(rankedChunks.length / stratifiedCount);
        for (let i = 0; i < stratifiedCount; i++) {
          selected.add(i * stride);
        }
        break;
    }

    return Array.from(selected).sort((a, b) => a - b);
  }

  cosineSimilarity(a, b) {
    let dot = 0, normA = 0, normB = 0;
    for (let i = 0; i < a.length; i++) {
      dot += a[i] * b[i];
      normA += a[i] * a[i];
      normB += b[i] * b[i];
    }
    return dot / (Math.sqrt(normA) * Math.sqrt(normB));
  }
}

// Usage example
async function queryLongDocument(document, query, ai, embedder) {
  const chunker = new SemanticChunker({ maxChunkSize: 500 });
  const chunks = chunker.chunk(document);

  const sparseManager = new SparseAttentionManager();
  const ranked = await sparseManager.scoreChunks(chunks, query, embedder);

  // Select top 8 chunks plus boundaries
  const selectedIndices = sparseManager.selectChunks(ranked, 10, 'topk-plus-boundary');
  const selectedChunks = selectedIndices.map(i => chunks[i]);

  // Build context from selected chunks
  const context = selectedChunks.join('\n\n---\n\n');

  const response = await ai.query({
    prompt: `Document excerpts:\n${context}\n\nQuestion: ${query}\n\nAnswer:`,
    maxTokens: 1000
  });

  return response;
}

Memory-Efficient Caching

Cache processed content to avoid re-computation:

class LongContextCache {
  constructor(maxMemoryMB = 50) {
    this.cache = new Map();
    this.maxMemory = maxMemoryMB * 1024 * 1024;
    this.currentMemory = 0;
  }

  async getOrCompute(key, computeFn) {
    if (this.cache.has(key)) {
      const entry = this.cache.get(key);
      entry.lastAccess = Date.now();
      return entry.value;
    }

    const value = await computeFn();
    this.set(key, value);
    return value;
  }

  set(key, value) {
    const size = this.estimateSize(value);

    // Evict if needed
    while (this.currentMemory + size > this.maxMemory && this.cache.size > 0) {
      this.evictLRU();
    }

    this.cache.set(key, {
      value,
      size,
      lastAccess: Date.now()
    });
    this.currentMemory += size;
  }

  evictLRU() {
    let oldest = null;
    let oldestTime = Infinity;

    for (const [key, entry] of this.cache) {
      if (entry.lastAccess < oldestTime) {
        oldest = key;
        oldestTime = entry.lastAccess;
      }
    }

    if (oldest) {
      this.currentMemory -= this.cache.get(oldest).size;
      this.cache.delete(oldest);
    }
  }

  estimateSize(value) {
    return JSON.stringify(value).length * 2; // rough estimate: ~2 bytes per UTF-16 character
  }

  // Cache document hierarchies
  async getCachedHierarchy(url, document, summarizer) {
    const cacheKey = `hierarchy:${url}:${this.hashContent(document)}`;
    return this.getOrCompute(cacheKey, () => summarizer.summarize(document));
  }

  // Cache embeddings
  async getCachedEmbeddings(chunks, embedder) {
    const results = [];
    for (const chunk of chunks) {
      const key = `embed:${this.hashContent(chunk)}`;
      const embedding = await this.getOrCompute(key, () => embedder.embed(chunk));
      results.push(embedding);
    }
    return results;
  }

  hashContent(content) {
    // Simple hash for cache key
    let hash = 0;
    for (let i = 0; i < Math.min(content.length, 1000); i++) {
      hash = ((hash << 5) - hash) + content.charCodeAt(i);
      hash |= 0;
    }
    return hash.toString(36);
  }
}
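
In practice the cache wraps the summarizer and embedder from the earlier sections so repeat visits to the same page skip the expensive calls. The summarizer, embedder, tab, pageText, and chunks variables below are assumed from earlier examples:

// Example usage (all inputs come from the earlier sections)
const cache = new LongContextCache(50);

const hierarchy = await cache.getCachedHierarchy(tab.url, pageText, summarizer);
const embeddings = await cache.getCachedEmbeddings(chunks, embedder);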

Conversation Memory

Maintain long conversation context efficiently:

class ConversationMemory {
  constructor(options = {}) {
    this.maxTurns = options.maxTurns || 20;
    this.summaryThreshold = options.summaryThreshold || 10;
    this.turns = [];
    this.summaries = [];
  }

  addTurn(role, content) {
    this.turns.push({ role, content, timestamp: Date.now() });

    // Summarize old turns when the threshold is reached
    // (compression runs asynchronously; it is intentionally not awaited here)
    if (this.turns.length > this.summaryThreshold) {
      this.compressOldTurns();
    }
  }

  async compressOldTurns() {
    const toCompress = this.turns.splice(0, this.turns.length - 5);
    const summary = await this.summarizeTurns(toCompress);
    this.summaries.push(summary);
  }

  async summarizeTurns(turns) {
    const transcript = turns.map(t => `${t.role}: ${t.content}`).join('\n');
    // Heuristic stand-in; in production, send the transcript to the AI client instead
    return `[Previous conversation summary: The user and assistant discussed ${
      this.extractTopics(transcript)
    }. Key points: ${this.extractKeyPoints(transcript)}]`;
  }

  buildContext(maxTokens = 4000) {
    let summaryContext = '';
    let tokenCount = 0;

    // Summaries of older turns come first, capped at ~30% of the budget
    for (const summary of this.summaries) {
      const tokens = this.estimateTokens(summary);
      if (tokenCount + tokens > maxTokens * 0.3) break;
      summaryContext += summary + '\n\n';
      tokenCount += tokens;
    }

    // Fill the rest with the most recent turns, then restore chronological order
    const recent = [];
    for (const turn of [...this.turns].reverse()) {
      const formatted = `${turn.role}: ${turn.content}`;
      const tokens = this.estimateTokens(formatted);

      if (tokenCount + tokens > maxTokens) break;
      recent.unshift(formatted);
      tokenCount += tokens;
    }

    return summaryContext + recent.join('\n');
  }

  extractTopics(text) {
    // Simple topic extraction
    const topics = new Set();
    const topicPatterns = [
      /discuss(?:ed|ing)?\s+(\w+(?:\s+\w+)?)/gi,
      /about\s+(\w+(?:\s+\w+)?)/gi,
      /help(?:ed|ing)?\s+with\s+(\w+(?:\s+\w+)?)/gi
    ];

    for (const pattern of topicPatterns) {
      let match;
      while ((match = pattern.exec(text)) !== null) {
        topics.add(match[1].toLowerCase());
      }
    }

    return Array.from(topics).slice(0, 3).join(', ') || 'various topics';
  }

  extractKeyPoints(text) {
    const sentences = text.split(/[.!?]+/).filter(s => s.trim());
    return sentences.slice(0, 2).join('. ').trim() || 'general assistance';
  }

  estimateTokens(text) {
    return Math.ceil(text.length / 4);
  }
}
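
A chat-style extension would record each exchange and prepend the compact history to the next prompt; the turns below are illustrative:

// Example usage in a chat-style side panel (turns are illustrative)
const memory = new ConversationMemory({ summaryThreshold: 10 });

memory.addTurn('user', 'Summarize the open tab about attention mechanisms.');
memory.addTurn('assistant', 'The page covers sliding-window and sparse attention...');

// Prepend the compact history to the next prompt
const history = memory.buildContext(4000);
const prompt = `${history}\nuser: What did we say about chunk overlap?`;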

Summary

Handling long contexts in browser extensions requires smart chunking, hierarchical summarization, sparse attention patterns, and efficient caching. The key is to extract and focus on the most relevant information rather than trying to process everything.

Key strategies:

  • Chunk content semantically, not arbitrarily
  • Create hierarchical summaries for very long documents
  • Use sparse attention to focus on relevant sections
  • Cache embeddings and summaries aggressively
  • Maintain conversation memory with periodic summarization
  • Balance context comprehensiveness with latency and cost