Browser extensions often need to process long documents, maintain conversation history, and synthesize information across multiple tabs. Standard attention mechanisms become impractical for these use cases. This guide covers practical techniques for handling long contexts efficiently in your AI-powered extensions.
The Long Context Challenge
Standard transformer attention scales quadratically with sequence length. For a 128K token context window, this means:
- 128,000 × 128,000 ≈ 16 billion pairwise attention scores (see the back-of-envelope sketch after these lists)
- Massive memory requirements for the attention matrices
- Unacceptable latency for interactive use
Even with today's large context windows, practical constraints remain:
- GPT-4 Turbo: 128K tokens (~100 pages)
- Claude: 200K tokens (~150 pages)
- Quality often degrades well before the advertised limit, frequently once effective context grows past roughly 8-16K tokens
- Cost scales with total tokens (input + output)
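A minimal sketch of that arithmetic, assuming fp16 scores (2 bytes each) and counting only the attention score matrix, not activations or the KV cache:
function attentionCost(contextTokens, bytesPerScore = 2 /* fp16 assumption */) {
  // Dense attention computes one score for every pair of tokens, per head, per layer.
  // Real kernels never materialize the full matrix, but the quadratic growth is the point.
  const scores = contextTokens * contextTokens;
  const bytes = scores * bytesPerScore;
  return {
    scores,                          // 128,000^2 ≈ 1.6e10
    gigabytes: bytes / (1024 ** 3)   // ≈ 30 GB per head, per layer, if materialized
  };
}

console.log(attentionCost(128_000)); // { scores: 16384000000, gigabytes: ~30.5 }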
Chunking Strategies
Semantic Chunking
Split content at meaningful boundaries:
class SemanticChunker {
  constructor(options = {}) {
    this.maxChunkSize = options.maxChunkSize || 1000; // tokens
    this.overlapSize = options.overlapSize || 100;
  }

  chunk(text) {
    // Split by semantic boundaries (paragraphs, sections)
    const sections = this.splitBySections(text);
    const chunks = [];
    for (const section of sections) {
      if (this.estimateTokens(section) <= this.maxChunkSize) {
        chunks.push(section);
      } else {
        // Sub-chunk large sections by paragraphs
        chunks.push(...this.chunkByParagraphs(section));
      }
    }
    return this.addOverlap(chunks);
  }

  splitBySections(text) {
    // Split by headers, horizontal rules, or large gaps
    return text.split(/(?=^#{1,3}\s|\n{3,}|^[-=]{3,}$)/m)
      .filter(s => s.trim());
  }

  chunkByParagraphs(text) {
    const paragraphs = text.split(/\n\n+/);
    const chunks = [];
    let currentChunk = '';
    for (const para of paragraphs) {
      if (this.estimateTokens(currentChunk + para) > this.maxChunkSize) {
        if (currentChunk) chunks.push(currentChunk.trim());
        currentChunk = para;
      } else {
        currentChunk += '\n\n' + para;
      }
    }
    if (currentChunk) chunks.push(currentChunk.trim());
    return chunks;
  }

  addOverlap(chunks) {
    return chunks.map((chunk, i) => {
      let result = chunk;
      // Add overlap from previous chunk
      if (i > 0) {
        const prevOverlap = this.getTrailingText(chunks[i - 1], this.overlapSize);
        result = `[...] ${prevOverlap}\n\n${result}`;
      }
      // Add overlap from next chunk
      if (i < chunks.length - 1) {
        const nextOverlap = this.getLeadingText(chunks[i + 1], this.overlapSize);
        result = `${result}\n\n${nextOverlap} [...]`;
      }
      return result;
    });
  }

  getTrailingText(text, tokens) {
    const words = text.split(/\s+/);
    return words.slice(-tokens * 0.75).join(' '); // ~0.75 words per token
  }

  getLeadingText(text, tokens) {
    const words = text.split(/\s+/);
    return words.slice(0, tokens * 0.75).join(' ');
  }

  estimateTokens(text) {
    // Rough estimation: 1 token ≈ 4 characters
    return Math.ceil(text.length / 4);
  }
}
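A quick usage sketch; extracting the article text via `querySelector('article')` is an illustrative assumption, not a requirement:
// Chunk a page's main content before sending anything to the model.
const chunker = new SemanticChunker({ maxChunkSize: 800, overlapSize: 80 });
const articleText = document.querySelector('article')?.innerText ?? document.body.innerText;
const chunks = chunker.chunk(articleText);

console.log(`Produced ${chunks.length} chunks, ~${chunker.estimateTokens(chunks[0] ?? '')} tokens in the first one`);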
Hierarchical Summarization
Create multi-level summaries for very long content:
class HierarchicalSummarizer {
  constructor(aiClient) {
    this.ai = aiClient;
    this.chunkSize = 2000; // tokens
    this.summaryRatio = 0.2; // 20% of original
  }

  async summarize(text) {
    const chunks = this.chunkText(text);

    // Level 1: Chunk summaries
    const chunkSummaries = await Promise.all(
      chunks.map(chunk => this.summarizeChunk(chunk))
    );

    // Level 2: Section summaries (group 5 chunks)
    const sectionSummaries = [];
    for (let i = 0; i < chunkSummaries.length; i += 5) {
      const section = chunkSummaries.slice(i, i + 5).join('\n\n');
      sectionSummaries.push(await this.summarizeChunk(section));
    }

    // Level 3: Document summary
    const documentSummary = await this.summarizeChunk(
      sectionSummaries.join('\n\n')
    );

    return {
      full: text,
      chunks: chunkSummaries,
      sections: sectionSummaries,
      document: documentSummary,
      hierarchy: this.buildHierarchy(chunks, chunkSummaries, sectionSummaries, documentSummary)
    };
  }

  async summarizeChunk(text) {
    const response = await this.ai.query({
      prompt: `Summarize the following text concisely, preserving key facts and relationships:\n\n${text}`,
      maxTokens: Math.ceil(this.estimateTokens(text) * this.summaryRatio)
    });
    return response.text;
  }

  buildHierarchy(chunks, chunkSummaries, sectionSummaries, documentSummary) {
    return {
      type: 'document',
      summary: documentSummary,
      children: sectionSummaries.map((ss, si) => ({
        type: 'section',
        summary: ss,
        children: chunkSummaries.slice(si * 5, (si + 1) * 5).map((cs, ci) => ({
          type: 'chunk',
          summary: cs,
          content: chunks[si * 5 + ci]
        }))
      }))
    };
  }

  chunkText(text) {
    const chunker = new SemanticChunker({ maxChunkSize: this.chunkSize });
    return chunker.chunk(text);
  }

  estimateTokens(text) {
    return Math.ceil(text.length / 4);
  }
}
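A usage sketch, assuming an `aiClient` with the `query({ prompt, maxTokens })` shape used throughout this guide (substitute whatever API wrapper your extension already has):
// Summarize a long page once, then answer follow-ups from the hierarchy
// instead of re-sending the full text.
const summarizer = new HierarchicalSummarizer(aiClient);

async function summarizePage() {
  const pageText = document.body.innerText;
  const result = await summarizer.summarize(pageText);

  console.log('One-paragraph overview:', result.document);
  console.log('Section summaries:', result.sections.length);
  // result.hierarchy lets you drill down: document -> sections -> chunks -> raw text
  return result;
}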
Sliding Window Attention
Process long content with a moving context window:
class SlidingWindowProcessor {
  constructor(aiClient, windowSize = 4000, strideSize = 3000) {
    this.ai = aiClient;
    this.windowSize = windowSize;
    this.strideSize = strideSize;
  }

  async process(text, query) {
    const windows = this.createWindows(text);
    const results = [];
    for (const window of windows) {
      const result = await this.processWindow(window, query);
      results.push(result);
    }
    return this.mergeResults(results);
  }

  createWindows(text) {
    const tokens = this.tokenize(text);
    const windows = [];
    for (let i = 0; i < tokens.length; i += this.strideSize) {
      windows.push({
        tokens: tokens.slice(i, i + this.windowSize),
        startOffset: i,
        endOffset: Math.min(i + this.windowSize, tokens.length)
      });
      // Stop if we've covered the entire text
      if (i + this.windowSize >= tokens.length) break;
    }
    return windows;
  }

  async processWindow(window, query) {
    const text = window.tokens.join(' ');
    const response = await this.ai.query({
      prompt: `Context:\n${text}\n\nQuery: ${query}\n\nAnswer based only on the context provided:`,
      maxTokens: 500
    });
    return {
      response: response.text,
      startOffset: window.startOffset,
      endOffset: window.endOffset,
      confidence: this.extractConfidence(response)
    };
  }

  mergeResults(results) {
    // Filter out low-confidence results
    const relevant = results.filter(r => r.confidence > 0.3);
    if (relevant.length === 0) {
      return { answer: "No relevant information found.", sources: [] };
    }
    // If multiple relevant results, synthesize
    if (relevant.length > 1) {
      return this.synthesize(relevant);
    }
    return {
      answer: relevant[0].response,
      sources: [relevant[0]]
    };
  }

  async synthesize(results) {
    const synthesis = await this.ai.query({
      prompt: `Synthesize these related findings into a coherent answer:\n\n${
        results.map((r, i) => `Finding ${i + 1}:\n${r.response}`).join('\n\n')
      }\n\nSynthesized answer:`,
      maxTokens: 500
    });
    return {
      answer: synthesis.text,
      sources: results
    };
  }

  tokenize(text) {
    return text.split(/\s+/);
  }

  extractConfidence(response) {
    // Heuristic: check for hedging language
    const hedges = ['might', 'possibly', 'unclear', 'not sure', 'cannot find'];
    const hasHedge = hedges.some(h => response.text.toLowerCase().includes(h));
    return hasHedge ? 0.3 : 0.8;
  }
}
Guidelines for sizing windows:
- Overlap windows by 20-30% to avoid missing cross-boundary content (a stride helper that does this follows the list)
- Use larger windows for complex reasoning tasks
- Use smaller windows for fact extraction
- Adjust based on your API's context limit and cost
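A small illustrative helper (`strideFor` is hypothetical, and `aiClient` is assumed as above) that derives the stride from a window size and a target overlap fraction; with the defaults it reproduces the 4000/3000 values in the constructor:
// Pick a stride so consecutive windows share roughly `overlapFraction` of their tokens.
// windowSize = 4000 and overlapFraction = 0.25 give stride = 3000 -- the defaults above.
function strideFor(windowSize = 4000, overlapFraction = 0.25) {
  const overlapTokens = Math.round(windowSize * overlapFraction);
  return windowSize - overlapTokens;
}

const windowSize = 4000;
const processor = new SlidingWindowProcessor(aiClient, windowSize, strideFor(windowSize, 0.25));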
Sparse Attention Patterns
Implement attention patterns that skip irrelevant content:
class SparseAttentionManager {
  constructor() {
    this.relevanceScores = new Map();
  }

  // Score chunks for relevance to query
  async scoreChunks(chunks, query, embedder) {
    const queryEmbed = await embedder.embed(query);
    const scores = await Promise.all(chunks.map(async (chunk, i) => {
      const chunkEmbed = await embedder.embed(chunk);
      return {
        index: i,
        score: this.cosineSimilarity(queryEmbed, chunkEmbed)
      };
    }));
    return scores.sort((a, b) => b.score - a.score);
  }

  // Select chunks using sparse pattern
  selectChunks(rankedChunks, budget, pattern = 'topk-plus-boundary') {
    const selected = new Set();
    switch (pattern) {
      case 'topk':
        // Just take top K chunks
        rankedChunks.slice(0, budget).forEach(c => selected.add(c.index));
        break;
      case 'topk-plus-boundary':
        // Top K plus first and last chunks for context
        rankedChunks.slice(0, budget - 2).forEach(c => selected.add(c.index));
        selected.add(0); // First chunk
        selected.add(rankedChunks.length - 1); // Last chunk
        break;
      case 'topk-plus-neighbors': {
        // Top K plus their neighbors for continuity
        const topK = Math.floor(budget / 2);
        rankedChunks.slice(0, topK).forEach(c => {
          selected.add(c.index);
          if (c.index > 0) selected.add(c.index - 1);
          if (c.index < rankedChunks.length - 1) selected.add(c.index + 1);
        });
        break;
      }
      case 'stratified': {
        // Sample from different parts of document
        const topCount = Math.floor(budget * 0.6);
        const stratifiedCount = budget - topCount;
        rankedChunks.slice(0, topCount).forEach(c => selected.add(c.index));
        const stride = Math.floor(rankedChunks.length / stratifiedCount);
        for (let i = 0; i < stratifiedCount; i++) {
          selected.add(i * stride);
        }
        break;
      }
    }
    return Array.from(selected).sort((a, b) => a - b);
  }

  cosineSimilarity(a, b) {
    let dot = 0, normA = 0, normB = 0;
    for (let i = 0; i < a.length; i++) {
      dot += a[i] * b[i];
      normA += a[i] * a[i];
      normB += b[i] * b[i];
    }
    return dot / (Math.sqrt(normA) * Math.sqrt(normB));
  }
}
// Usage example
async function queryLongDocument(documentText, query, ai, embedder) {
  const chunker = new SemanticChunker({ maxChunkSize: 500 });
  const chunks = chunker.chunk(documentText);

  const sparseManager = new SparseAttentionManager();
  const ranked = await sparseManager.scoreChunks(chunks, query, embedder);

  // Select top 8 chunks plus the first and last chunk as boundaries
  const selectedIndices = sparseManager.selectChunks(ranked, 10, 'topk-plus-boundary');
  const selectedChunks = selectedIndices.map(i => chunks[i]);

  // Build context from selected chunks
  const context = selectedChunks.join('\n\n---\n\n');
  const response = await ai.query({
    prompt: `Document excerpts:\n${context}\n\nQuestion: ${query}\n\nAnswer:`,
    maxTokens: 1000
  });
  return response;
}
Memory-Efficient Caching
Cache processed content to avoid re-computation:
class LongContextCache {
  constructor(maxMemoryMB = 50) {
    this.cache = new Map();
    this.maxMemory = maxMemoryMB * 1024 * 1024;
    this.currentMemory = 0;
  }

  async getOrCompute(key, computeFn) {
    if (this.cache.has(key)) {
      const entry = this.cache.get(key);
      entry.lastAccess = Date.now();
      return entry.value;
    }
    const value = await computeFn();
    this.set(key, value);
    return value;
  }

  set(key, value) {
    const size = this.estimateSize(value);
    // Evict if needed
    while (this.currentMemory + size > this.maxMemory && this.cache.size > 0) {
      this.evictLRU();
    }
    this.cache.set(key, {
      value,
      size,
      lastAccess: Date.now()
    });
    this.currentMemory += size;
  }

  evictLRU() {
    let oldest = null;
    let oldestTime = Infinity;
    for (const [key, entry] of this.cache) {
      if (entry.lastAccess < oldestTime) {
        oldest = key;
        oldestTime = entry.lastAccess;
      }
    }
    if (oldest) {
      this.currentMemory -= this.cache.get(oldest).size;
      this.cache.delete(oldest);
    }
  }

  estimateSize(value) {
    return JSON.stringify(value).length * 2; // Rough estimate
  }

  // Cache document hierarchies
  async getCachedHierarchy(url, document, summarizer) {
    const cacheKey = `hierarchy:${url}:${this.hashContent(document)}`;
    return this.getOrCompute(cacheKey, () => summarizer.summarize(document));
  }

  // Cache embeddings
  async getCachedEmbeddings(chunks, embedder) {
    const results = [];
    for (const chunk of chunks) {
      const key = `embed:${this.hashContent(chunk)}`;
      const embedding = await this.getOrCompute(key, () => embedder.embed(chunk));
      results.push(embedding);
    }
    return results;
  }

  hashContent(content) {
    // Simple hash for cache key
    let hash = 0;
    for (let i = 0; i < Math.min(content.length, 1000); i++) {
      hash = ((hash << 5) - hash) + content.charCodeAt(i);
      hash |= 0;
    }
    return hash.toString(36);
  }
}
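A usage sketch tying the cache to the earlier pieces; `embedder` and `summarizer` stand in for whatever embedding and summarization clients your extension already uses:
// One shared cache for the extension session (e.g., in the background service worker).
const contextCache = new LongContextCache(50);

async function analyzeTab(url, pageText, query) {
  // Hierarchy and embeddings are recomputed only when the page content changes,
  // because the cache key includes a hash of the content.
  const hierarchy = await contextCache.getCachedHierarchy(url, pageText, summarizer);

  const chunker = new SemanticChunker({ maxChunkSize: 500 });
  const chunks = chunker.chunk(pageText);
  const embeddings = await contextCache.getCachedEmbeddings(chunks, embedder);

  return { hierarchy, chunks, embeddings };
}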
Conversation Memory
Maintain long conversation context efficiently:
class ConversationMemory {
  constructor(options = {}) {
    this.maxTurns = options.maxTurns || 20;
    this.summaryThreshold = options.summaryThreshold || 10;
    this.turns = [];
    this.summaries = [];
  }

  addTurn(role, content) {
    this.turns.push({ role, content, timestamp: Date.now() });
    // Summarize old turns when threshold reached
    if (this.turns.length > this.summaryThreshold) {
      this.compressOldTurns();
    }
  }

  async compressOldTurns() {
    // Keep the five most recent turns verbatim; compress the rest into a summary
    const toCompress = this.turns.splice(0, this.turns.length - 5);
    const summary = await this.summarizeTurns(toCompress);
    this.summaries.push(summary);
  }

  async summarizeTurns(turns) {
    const transcript = turns.map(t => `${t.role}: ${t.content}`).join('\n');
    // Heuristic summary; a production extension could call the AI client here instead
    return `[Previous conversation summary: The user and assistant discussed ${
      this.extractTopics(transcript)
    }. Key points: ${this.extractKeyPoints(transcript)}]`;
  }

  buildContext(maxTokens = 4000) {
    let context = '';
    let tokenCount = 0;

    // Add summaries of older turns first, capped at ~30% of the budget
    for (const summary of this.summaries) {
      const tokens = this.estimateTokens(summary);
      if (tokenCount + tokens > maxTokens * 0.3) break;
      context += summary + '\n\n';
      tokenCount += tokens;
    }

    // Then add as many recent turns as fit, newest-first for budgeting,
    // but emitted in chronological order after the summaries
    const recentLines = [];
    for (let i = this.turns.length - 1; i >= 0; i--) {
      const formatted = `${this.turns[i].role}: ${this.turns[i].content}`;
      const tokens = this.estimateTokens(formatted);
      if (tokenCount + tokens > maxTokens) break;
      recentLines.unshift(formatted);
      tokenCount += tokens;
    }
    context += recentLines.join('\n');

    return context;
  }

  extractTopics(text) {
    // Simple topic extraction
    const topics = new Set();
    const topicPatterns = [
      /discuss(?:ed|ing)?\s+(\w+(?:\s+\w+)?)/gi,
      /about\s+(\w+(?:\s+\w+)?)/gi,
      /help(?:ed|ing)?\s+with\s+(\w+(?:\s+\w+)?)/gi
    ];
    for (const pattern of topicPatterns) {
      let match;
      while ((match = pattern.exec(text)) !== null) {
        topics.add(match[1].toLowerCase());
      }
    }
    return Array.from(topics).slice(0, 3).join(', ') || 'various topics';
  }

  extractKeyPoints(text) {
    const sentences = text.split(/[.!?]+/).filter(s => s.trim());
    return sentences.slice(0, 2).join('. ').trim() || 'general assistance';
  }

  estimateTokens(text) {
    return Math.ceil(text.length / 4);
  }
}
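A brief usage sketch; the `aiClient` call and prompt shape are assumptions following the pattern used throughout this guide:
const memory = new ConversationMemory({ summaryThreshold: 10 });

async function handleUserMessage(userText) {
  memory.addTurn('user', userText);

  // Older turns arrive as compact summaries, recent turns verbatim.
  const context = memory.buildContext(4000);
  const response = await aiClient.query({
    prompt: `${context}\nassistant:`,
    maxTokens: 500
  });

  memory.addTurn('assistant', response.text);
  return response.text;
}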
Summary
Handling long contexts in browser extensions requires smart chunking, hierarchical summarization, sparse attention patterns, and efficient caching. The key is to extract and focus on the most relevant information rather than trying to process everything.
Key strategies:
- Chunk content semantically, not arbitrarily
- Create hierarchical summaries for very long documents
- Use sparse attention to focus on relevant sections
- Cache embeddings and summaries aggressively
- Maintain conversation memory with periodic summarization
- Balance context comprehensiveness with latency and cost