Retrieval-Augmented Generation (RAG) combines the power of large language models with the precision of information retrieval. For browser extensions, RAG enables AI features that are grounded in specific, relevant context rather than relying solely on the model’s training data. This guide shows you how to implement RAG patterns effectively in your extensions.
What is RAG?
RAG works in three steps:
- Retrieve: Find relevant information from a knowledge base
- Augment: Add this information to the AI prompt
- Generate: Let the AI produce a response using both its training and the retrieved context
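In code, the whole loop is just a few calls. Here is a minimal sketch; `embed`, `vectorStore.search`, `buildPrompt`, and `llm.generate` are placeholders that later sections of this guide fill in:

// The three RAG steps as one pipeline call (placeholder helpers)
async function ragAnswer(question) {
  const docs = await vectorStore.search(await embed(question)); // Retrieve
  const prompt = buildPrompt(question, docs);                   // Augment
  return llm.generate(prompt);                                  // Generate
}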
RAG Architecture for Extensions
┌─────────────────────────────────────────────────────┐
│                  Browser Extension                  │
├─────────────────────────────────────────────────────┤
│  ┌──────────┐    ┌──────────┐    ┌──────────────┐   │
│  │ Content  │───▶│ Indexer  │───▶│ Vector Store │   │
│  │ Extractor│    │          │    │  (Local DB)  │   │
│  └──────────┘    └──────────┘    └──────────────┘   │
│       │                                 │           │
│       │          ┌──────────┐           │           │
│       └─────────▶│ Retriever│◀──────────┘           │
│                  └────┬─────┘                       │
│                       │                             │
│                       ▼                             │
│            ┌─────────────────────┐                  │
│            │   Prompt Builder    │                  │
│            │  (Query + Context)  │                  │
│            └──────────┬──────────┘                  │
│                       │                             │
└───────────────────────┼─────────────────────────────┘
                        │
                        ▼
               ┌─────────────────┐
               │   AI Service    │
               │  (GPT/Claude)   │
               └─────────────────┘
Implementing Local Vector Storage
For privacy and performance, store embeddings locally using IndexedDB:
// vector-store.js
class LocalVectorStore {
  constructor(dbName = 'rag-vectors') {
    this.dbName = dbName;
    this.db = null;
  }

  async initialize() {
    return new Promise((resolve, reject) => {
      const request = indexedDB.open(this.dbName, 1);
      request.onerror = () => reject(request.error);
      request.onsuccess = () => {
        this.db = request.result;
        resolve();
      };
      request.onupgradeneeded = (event) => {
        const db = event.target.result;
        const store = db.createObjectStore('vectors', { keyPath: 'id' });
        // Index the nested metadata.url so records can be looked up by page
        store.createIndex('source', 'metadata.url', { unique: false });
        store.createIndex('timestamp', 'timestamp', { unique: false });
      };
    });
  }

  async addVector(id, vector, metadata) {
    return new Promise((resolve, reject) => {
      const transaction = this.db.transaction(['vectors'], 'readwrite');
      const store = transaction.objectStore('vectors');
      store.put({
        id,
        vector,
        metadata,
        timestamp: Date.now()
      });
      transaction.oncomplete = () => resolve();
      transaction.onerror = () => reject(transaction.error);
    });
  }

  async search(queryVector, topK = 5) {
    return new Promise((resolve, reject) => {
      const transaction = this.db.transaction(['vectors'], 'readonly');
      const store = transaction.objectStore('vectors');
      // A brute-force scan is fine at this scale; a few thousand
      // vectors still rank in milliseconds
      const request = store.getAll();
      request.onsuccess = () => {
        const results = request.result
          .map(item => ({
            ...item,
            similarity: this.cosineSimilarity(queryVector, item.vector)
          }))
          .sort((a, b) => b.similarity - a.similarity)
          .slice(0, topK);
        resolve(results);
      };
      request.onerror = () => reject(request.error);
    });
  }

  cosineSimilarity(a, b) {
    let dotProduct = 0;
    let normA = 0;
    let normB = 0;
    for (let i = 0; i < a.length; i++) {
      dotProduct += a[i] * b[i];
      normA += a[i] * a[i];
      normB += b[i] * b[i];
    }
    // Guard against a zero-length vector, which would divide by zero
    const denominator = Math.sqrt(normA) * Math.sqrt(normB);
    return denominator === 0 ? 0 : dotProduct / denominator;
  }
}
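A quick usage sketch; the three-element vectors are toy stand-ins for real embeddings, and the metadata values are hypothetical:

// Store two vectors, then find the closest match for a query
const store = new LocalVectorStore();
await store.initialize();
await store.addVector('doc-1', [0.1, 0.9, 0.2], { url: 'https://example.com/a', text: '...' });
await store.addVector('doc-2', [0.8, 0.1, 0.3], { url: 'https://example.com/b', text: '...' });
const hits = await store.search([0.15, 0.85, 0.25], 1);
console.log(hits[0].id, hits[0].similarity); // 'doc-1', ~0.99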
Generating Embeddings
You can generate embeddings locally with a lightweight model or remotely via an API. The two are not interchangeable within one store: the Universal Sentence Encoder produces 512-dimensional vectors while OpenAI's text-embedding-3-small produces 1,536-dimensional ones, so pick a single model per vector store:
// embeddings.js
class EmbeddingService {
  constructor(options = {}) {
    this.useLocal = options.useLocal || false;
    this.model = null;
  }

  async initialize() {
    if (this.useLocal) {
      // Use the TensorFlow.js Universal Sentence Encoder
      const use = await import('@tensorflow-models/universal-sentence-encoder');
      this.model = await use.load();
    }
  }

  async embed(text) {
    if (this.useLocal && this.model) {
      const embeddings = await this.model.embed([text]);
      return Array.from(await embeddings.data());
    }
    // Fall back to the OpenAI embeddings API
    const response = await fetch('https://api.openai.com/v1/embeddings', {
      method: 'POST',
      headers: {
        'Authorization': `Bearer ${await this.getApiKey()}`,
        'Content-Type': 'application/json'
      },
      body: JSON.stringify({
        model: 'text-embedding-3-small',
        input: text
      })
    });
    if (!response.ok) {
      throw new Error(`Embedding request failed: ${response.status}`);
    }
    const data = await response.json();
    return data.data[0].embedding;
  }

  async getApiKey() {
    const { apiKey } = await chrome.storage.sync.get('apiKey');
    return apiKey;
  }
}
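Either mode is a drop-in source of vectors. A usage sketch for the local path; it assumes `@tensorflow/tfjs` is bundled alongside the encoder package, since the encoder needs a TensorFlow.js backend registered before `load()` runs:

// Local mode: 512-dimensional vectors, no network round trip
import '@tensorflow/tfjs';
const embedder = new EmbeddingService({ useLocal: true });
await embedder.initialize();
const vec = await embedder.embed('What did that article say about RAG?');
console.log(vec.length); // 512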
Content Extraction and Indexing
Build a system to extract and index content from web pages:
// content-indexer.js
class ContentIndexer {
  constructor(vectorStore, embeddingService) {
    this.vectorStore = vectorStore;
    this.embeddingService = embeddingService;
  }

  // Takes the title as a parameter so this can run outside the page,
  // e.g. in the background service worker
  async indexPage(url, title, content) {
    // Split content into chunks small enough to embed and retrieve
    const chunks = this.chunkContent(content);
    for (let i = 0; i < chunks.length; i++) {
      const chunk = chunks[i];
      const vector = await this.embeddingService.embed(chunk.text);
      await this.vectorStore.addVector(
        `${url}#chunk-${i}`,
        vector,
        {
          url,
          title,
          text: chunk.text,
          position: i,
          totalChunks: chunks.length
        }
      );
    }
  }

  chunkContent(content, maxChunkSize = 500) {
    const chunks = [];
    const paragraphs = content.split(/\n\n+/);
    let currentChunk = '';
    for (const paragraph of paragraphs) {
      if (currentChunk.length + paragraph.length > maxChunkSize) {
        if (currentChunk) {
          chunks.push({ text: currentChunk.trim() });
        }
        // Note: a single paragraph longer than maxChunkSize becomes
        // its own oversized chunk
        currentChunk = paragraph;
      } else {
        currentChunk = currentChunk ? `${currentChunk}\n\n${paragraph}` : paragraph;
      }
    }
    if (currentChunk) {
      chunks.push({ text: currentChunk.trim() });
    }
    return chunks;
  }

  extractPageContent() {
    // Remove scripts, styles, and navigation
    const clone = document.body.cloneNode(true);
    const remove = clone.querySelectorAll('script, style, nav, header, footer, aside');
    remove.forEach(el => el.remove());
    // Prefer the main content region if the page declares one
    const main = clone.querySelector('main, article, [role="main"]') || clone;
    return main.textContent.replace(/\s+/g, ' ').trim();
  }
}
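The paragraph-based chunker above can cut a thought in half at a chunk boundary. A common refinement is to overlap adjacent chunks so each retrieved passage keeps some surrounding context. A sliding-window variant might look like this; the 500/100 sizes are arbitrary starting points to tune, and overlap must stay smaller than maxChunkSize or the loop won't advance:

// Sliding-window chunking: each chunk repeats the tail of the previous one
function chunkWithOverlap(content, maxChunkSize = 500, overlap = 100) {
  const text = content.replace(/\s+/g, ' ').trim();
  const chunks = [];
  let start = 0;
  while (start < text.length) {
    const end = Math.min(start + maxChunkSize, text.length);
    chunks.push({ text: text.slice(start, end) });
    if (end === text.length) break;
    start = end - overlap; // step back so chunks share `overlap` characters
  }
  return chunks;
}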
Building the RAG Pipeline
Combine retrieval and generation into a unified pipeline:
// rag-pipeline.js
class RAGPipeline {
  constructor(vectorStore, embeddingService) {
    this.vectorStore = vectorStore;
    this.embeddingService = embeddingService;
  }

  async query(userQuestion, options = {}) {
    const { topK = 3 } = options;
    // Step 1: Embed the question
    const questionVector = await this.embeddingService.embed(userQuestion);
    // Step 2: Retrieve relevant context
    const relevantDocs = await this.vectorStore.search(questionVector, topK);
    // Step 3: Build the augmented prompt
    const prompt = this.buildPrompt(userQuestion, relevantDocs);
    // Step 4: Generate the response
    const response = await this.generateResponse(prompt);
    return {
      answer: response,
      sources: relevantDocs.map(d => ({
        url: d.metadata.url,
        title: d.metadata.title,
        relevance: d.similarity
      }))
    };
  }

  buildPrompt(question, documents) {
    const context = documents
      .map(doc => `Source: ${doc.metadata.title}\n${doc.metadata.text}`)
      .join('\n\n---\n\n');
    return `You are a helpful assistant. Answer the user's question based on the following context from their browsing history. If the context doesn't contain relevant information, say so.

## Context
${context}

## Question
${question}

## Instructions
- Base your answer on the provided context
- Cite sources when possible
- If you're unsure, acknowledge uncertainty
- Be concise but thorough`;
  }

  async generateResponse(prompt) {
    // Call this from the background service worker, with host permission
    // for api.anthropic.com, so the request isn't blocked by CORS
    const response = await fetch('https://api.anthropic.com/v1/messages', {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'x-api-key': await this.getApiKey(),
        'anthropic-version': '2023-06-01'
      },
      body: JSON.stringify({
        model: 'claude-3-haiku-20240307',
        max_tokens: 1024,
        messages: [{ role: 'user', content: prompt }]
      })
    });
    if (!response.ok) {
      throw new Error(`Generation request failed: ${response.status}`);
    }
    const data = await response.json();
    return data.content[0].text;
  }

  async getApiKey() {
    const { apiKey } = await chrome.storage.sync.get('apiKey');
    return apiKey;
  }
}
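Wired together, a popup or side panel can answer questions in a few lines. This assumes the store and embedding service from earlier sections are already initialized; the question text is just an example:

// Answer a question and list the cited sources
const pipeline = new RAGPipeline(vectorStore, embeddingService);
const result = await pipeline.query('What were the key points of that pricing article?');
console.log(result.answer);
for (const source of result.sources) {
  console.log(`- ${source.title} (${source.url}) relevance=${source.relevance.toFixed(2)}`);
}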
Optimizing Retrieval Quality
Hybrid Search
Combine semantic search with keyword matching for better results:
async function hybridSearch(query, topK = 5) {
  // Semantic search
  const queryVector = await embeddingService.embed(query);
  const semanticResults = await vectorStore.search(queryVector, topK * 2);
  // Keyword search
  const keywords = extractKeywords(query);
  const keywordResults = await keywordSearch(keywords, topK * 2);
  // Merge and de-duplicate
  const merged = mergeResults(semanticResults, keywordResults);
  // Re-rank using a cross-encoder or simple scoring
  const reranked = rerank(query, merged);
  return reranked.slice(0, topK);
}

function extractKeywords(text) {
  const stopWords = new Set(['the', 'a', 'an', 'is', 'are', 'was', 'were', 'to', 'of']);
  return text
    .toLowerCase()
    .split(/\W+/)
    .filter(word => word.length > 2 && !stopWords.has(word));
}

function mergeResults(semantic, keyword) {
  const seen = new Set();
  const merged = [];
  for (const result of [...semantic, ...keyword]) {
    if (!seen.has(result.id)) {
      seen.add(result.id);
      merged.push(result);
    }
  }
  return merged;
}
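hybridSearch relies on two helpers not yet defined: keywordSearch and rerank. A minimal sketch of both, scanning the same IndexedDB store for keyword matches and blending scores with fixed weights instead of a cross-encoder (the 0.7/0.3 split is an arbitrary starting point):

// keywordSearch: score stored chunks by how many query terms they contain
async function keywordSearch(keywords, topK) {
  const transaction = vectorStore.db.transaction(['vectors'], 'readonly');
  const request = transaction.objectStore('vectors').getAll();
  const items = await new Promise((resolve, reject) => {
    request.onsuccess = () => resolve(request.result);
    request.onerror = () => reject(request.error);
  });
  return items
    .map(item => {
      const text = (item.metadata.text || '').toLowerCase();
      const matches = keywords.filter(kw => text.includes(kw)).length;
      return { ...item, similarity: matches / Math.max(keywords.length, 1) };
    })
    .filter(item => item.similarity > 0)
    .sort((a, b) => b.similarity - a.similarity)
    .slice(0, topK);
}

// rerank: blend the retrieval score with keyword overlap
function rerank(query, results) {
  const keywords = extractKeywords(query);
  return results
    .map(r => {
      const text = (r.metadata.text || '').toLowerCase();
      const kwScore = keywords.filter(kw => text.includes(kw)).length /
        Math.max(keywords.length, 1);
      return { ...r, score: 0.7 * (r.similarity || 0) + 0.3 * kwScore };
    })
    .sort((a, b) => b.score - a.score);
}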
Query Expansion
Improve recall by expanding the query with related terms before embedding it:
async function expandQuery(originalQuery) {
  const expansion = await generateQueryExpansion(originalQuery);
  return {
    original: originalQuery,
    expanded: expansion,
    combined: `${originalQuery} ${expansion}`
  };
}

async function generateQueryExpansion(query) {
  // Read the stored key rather than relying on a global
  const { apiKey } = await chrome.storage.sync.get('apiKey');
  const response = await fetch('https://api.openai.com/v1/chat/completions', {
    method: 'POST',
    headers: {
      'Authorization': `Bearer ${apiKey}`,
      'Content-Type': 'application/json'
    },
    body: JSON.stringify({
      model: 'gpt-3.5-turbo',
      messages: [{
        role: 'system',
        content: 'Generate 3-5 related search terms for the given query. Return only the terms, comma-separated.'
      }, {
        role: 'user',
        content: query
      }],
      max_tokens: 50
    })
  });
  const data = await response.json();
  return data.choices[0].message.content;
}
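The expanded string then feeds the normal retrieval path. A sketch of the wiring, retrieving with the combined query but answering the original question (the query text is just an example):

// Retrieve broadly, but keep the user's original wording in the prompt
const { original, combined } = await expandQuery('tradeoffs of local embeddings');
const queryVector = await embeddingService.embed(combined);
const docs = await vectorStore.search(queryVector, 5);
const prompt = pipeline.buildPrompt(original, docs);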
Managing the Knowledge Base
Automatic Indexing
Index pages as users browse:
// content-script.js
async function autoIndex() {
  // Check whether this page should be indexed at all
  const { indexSettings } = await chrome.storage.sync.get('indexSettings');
  const shouldIndex = indexSettings?.autoIndex &&
    !isExcludedDomain(location.hostname, indexSettings.excludedDomains);
  if (!shouldIndex) return;

  // Wait for the page to fully load
  await new Promise(resolve => {
    if (document.readyState === 'complete') resolve();
    else window.addEventListener('load', resolve);
  });

  // Extract and send content for indexing; extractPageContent is the
  // DOM-scraping helper from the ContentIndexer section
  const content = extractPageContent();
  chrome.runtime.sendMessage({
    type: 'INDEX_PAGE',
    payload: {
      url: location.href,
      title: document.title,
      content
    }
  });
}

function isExcludedDomain(hostname, excludedDomains = []) {
  return excludedDomains.some(domain =>
    hostname === domain || hostname.endsWith(`.${domain}`));
}

autoIndex();
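The content script only sends the message; the extension still needs a receiver. A minimal service-worker handler might look like this, assuming a ContentIndexer instance named `indexer` built from the earlier sections:

// background.js — receive INDEX_PAGE messages and index the content
chrome.runtime.onMessage.addListener((message, sender, sendResponse) => {
  if (message.type !== 'INDEX_PAGE') return;
  const { url, title, content } = message.payload;
  indexer.indexPage(url, title, content)
    .then(() => sendResponse({ ok: true }))
    .catch(error => sendResponse({ ok: false, error: error.message }));
  return true; // keep the message channel open for the async response
});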
Storage Management
Keep the vector store size manageable:
async function pruneOldVectors(maxAge = 30 * 24 * 60 * 60 * 1000) {
  // `db` is the open IndexedDB handle from LocalVectorStore
  const cutoff = Date.now() - maxAge;
  return new Promise((resolve, reject) => {
    const transaction = db.transaction(['vectors'], 'readwrite');
    const store = transaction.objectStore('vectors');
    const index = store.index('timestamp');
    const range = IDBKeyRange.upperBound(cutoff);
    const request = index.openCursor(range);
    request.onsuccess = (event) => {
      const cursor = event.target.result;
      if (cursor) {
        cursor.delete();
        cursor.continue();
      }
    };
    transaction.oncomplete = () => resolve();
    transaction.onerror = () => reject(transaction.error);
  });
}

// Run weekly (requires the "alarms" permission in the manifest)
chrome.alarms.create('prune-vectors', { periodInMinutes: 60 * 24 * 7 });
chrome.alarms.onAlarm.addListener((alarm) => {
  if (alarm.name === 'prune-vectors') {
    pruneOldVectors();
  }
});
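Age is one signal; total footprint is another. The StorageManager API reports how much the extension is using, so pruning can also be triggered by size. A sketch, where the 80% threshold and one-week retention are arbitrary examples; it could run from the same weekly alarm:

// Prune more aggressively when nearing the storage quota
async function pruneIfNearQuota() {
  const { usage, quota } = await navigator.storage.estimate();
  if (quota && usage / quota > 0.8) {
    await pruneOldVectors(7 * 24 * 60 * 60 * 1000); // keep only the last week
  }
}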
Summary
RAG transforms browser extensions from simple tools into intelligent assistants that understand user context. By combining local vector storage, smart retrieval, and LLM generation, you can build extensions that provide highly relevant, grounded responses.
Key implementation points:
- Use IndexedDB for local vector storage
- Balance local vs API embeddings based on your needs
- Implement hybrid search for better retrieval
- Give users control over their knowledge base
- Prune old data to manage storage