AI-powered browser extensions face a unique challenge: balancing responsiveness with the reality of API latency and rate limits. This guide covers advanced techniques for managing AI requests efficiently, reducing perceived latency, and keeping your extension snappy even under heavy use.
Understanding the Latency Problem
AI API calls typically take 500ms-3s to complete. Without optimization, users experience:
- Blocking UI during requests
- Wasted API calls for rapid input changes
- Rate limit errors under heavy use
- Poor perceived performance
A typical request breaks down roughly as:
- Network round-trip: 50-200ms
- Server-side request queuing: 0-500ms
- Token generation: 200ms-2s+
- Response parsing: 10-50ms
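Which of these dominates varies by provider and workload, so it is worth measuring in your own extension. Here is a minimal timing sketch, assuming the same /api/ai endpoint used throughout this guide: fetch resolves once response headers arrive, and response.json() resolves once the body has been downloaded and parsed, which gives a rough two-part split.

// Rough client-side timing of a single AI request
async function timedQuery(prompt) {
  const start = performance.now();

  const response = await fetch('/api/ai', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ prompt })
  });
  // Time to headers: network + queuing (+ generation, for non-streaming endpoints)
  const headersAt = performance.now();

  const data = await response.json();
  // Time to body: download + parse
  const doneAt = performance.now();

  console.debug(
    `headers: ${Math.round(headersAt - start)}ms, ` +
    `body: ${Math.round(doneAt - headersAt)}ms, ` +
    `total: ${Math.round(doneAt - start)}ms`
  );
  return data;
}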
Request Debouncing
Prevent excessive API calls when users type rapidly:
class DebouncedAI {
  constructor(delayMs = 500) {
    this.delay = delayMs;
    this.timeout = null;
    this.pendingPromise = null;
    this.pendingResolve = null;
  }

  query(prompt) {
    // Cancel the previously scheduled request; the latest prompt wins
    if (this.timeout) {
      clearTimeout(this.timeout);
      this.timeout = null;
    }

    // Reuse one shared promise so every caller in the burst gets the final result
    if (!this.pendingPromise) {
      this.pendingPromise = new Promise((resolve) => {
        this.pendingResolve = resolve;
      });
    }

    // Re-arm the timer with the most recent prompt
    this.timeout = setTimeout(async () => {
      const resolve = this.pendingResolve;
      this.pendingPromise = null;
      this.pendingResolve = null;
      this.timeout = null;
      try {
        resolve(await this.executeQuery(prompt));
      } catch (error) {
        resolve({ error: error.message });
      }
    }, this.delay);

    return this.pendingPromise;
  }

  async executeQuery(prompt) {
    const response = await fetch('/api/ai', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ prompt })
    });
    return response.json();
  }

  // Cancel any pending request
  cancel() {
    if (this.timeout) {
      clearTimeout(this.timeout);
      this.timeout = null;
    }
    if (this.pendingResolve) {
      this.pendingResolve({ cancelled: true });
      this.pendingPromise = null;
      this.pendingResolve = null;
    }
  }
}
// Usage
const ai = new DebouncedAI(300);

inputField.addEventListener('input', async (e) => {
  const result = await ai.query(e.target.value);
  if (!result.cancelled) {
    displayResult(result);
  }
});
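Note that the debouncer above only cancels requests that have not fired yet; once executeQuery() starts, the fetch runs to completion. If abandoning in-flight requests matters, one option is to thread an AbortController through executeQuery, as in this sketch (AbortableAI is a hypothetical subclass, not part of the class above):

// Variant that also aborts an in-flight request when a newer one starts
class AbortableAI extends DebouncedAI {
  async executeQuery(prompt) {
    // Abort whatever is still in flight from the previous query
    this.controller?.abort();
    this.controller = new AbortController();

    const response = await fetch('/api/ai', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ prompt }),
      signal: this.controller.signal
    });
    return response.json();
  }
}

An aborted fetch rejects with an AbortError, which the existing catch block turns into an { error } result; you may want to detect that case and skip rendering it.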
Request Batching
Combine multiple requests into a single API call:
class BatchedAI {
  constructor(options = {}) {
    this.batchSize = options.batchSize || 5;
    this.batchDelayMs = options.batchDelayMs || 100;
    this.queue = [];
    this.processing = false;
    this.batchTimeout = null;
  }

  async query(prompt, context = {}) {
    return new Promise((resolve, reject) => {
      this.queue.push({ prompt, context, resolve, reject });

      // Start the batch timer if not already running
      if (!this.batchTimeout) {
        this.batchTimeout = setTimeout(() => this.processBatch(), this.batchDelayMs);
      }

      // Process immediately if the batch is full
      if (this.queue.length >= this.batchSize) {
        clearTimeout(this.batchTimeout);
        this.batchTimeout = null; // allow later calls to schedule a fresh timer
        this.processBatch();
      }
    });
  }

  async processBatch() {
    if (this.processing || this.queue.length === 0) return;
    this.processing = true;
    this.batchTimeout = null;

    // Take up to batchSize items from the queue
    const batch = this.queue.splice(0, this.batchSize);

    try {
      // Send all prompts in a single request
      const response = await fetch('/api/ai/batch', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({
          requests: batch.map((item, index) => ({
            id: index,
            prompt: item.prompt,
            context: item.context
          }))
        })
      });
      const results = await response.json();

      // Resolve the individual promises in order
      batch.forEach((item, index) => {
        if (results[index]?.error) {
          item.reject(new Error(results[index].error));
        } else {
          item.resolve(results[index]);
        }
      });
    } catch (error) {
      // Reject every item in the failed batch
      batch.forEach(item => item.reject(error));
    } finally {
      this.processing = false;
      // Schedule any remaining items
      if (this.queue.length > 0) {
        this.batchTimeout = setTimeout(() => this.processBatch(), this.batchDelayMs);
      }
    }
  }
}
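Usage mirrors the single-request clients: each caller awaits its own promise while the class coalesces requests behind the scenes. A sketch (the link-classification prompt and the per-item { text } response shape are assumptions about your batch endpoint):

// Usage: enrich every link on the page, at most 5 prompts per API call
const batched = new BatchedAI({ batchSize: 5, batchDelayMs: 100 });

document.querySelectorAll('a[href]').forEach(async (link) => {
  try {
    const result = await batched.query(`Classify this link: ${link.href}`);
    link.title = result.text; // assumes each batch result has a text field
  } catch (error) {
    console.debug('Enrichment failed for', link.href, error);
  }
});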
Priority Queue
Handle urgent requests ahead of background tasks:
class PriorityAIQueue {
  constructor() {
    this.queues = {
      high: [],   // User-initiated, visible UI
      normal: [], // Background enrichment
      low: []     // Prefetching, analytics
    };
    this.processing = false;
    this.concurrency = 2;
    this.activeRequests = 0;
  }

  async enqueue(prompt, priority = 'normal') {
    return new Promise((resolve, reject) => {
      this.queues[priority].push({ prompt, resolve, reject, addedAt: Date.now() });
      this.processQueue();
    });
  }

  async processQueue() {
    if (this.activeRequests >= this.concurrency) return;

    // Get the next item by priority
    const item = this.getNextItem();
    if (!item) return;

    this.activeRequests++;
    try {
      const result = await this.executeRequest(item.prompt);
      item.resolve(result);
    } catch (error) {
      item.reject(error);
    } finally {
      this.activeRequests--;
      this.processQueue(); // Process the next item
    }
  }

  getNextItem() {
    // Check queues in priority order
    for (const priority of ['high', 'normal', 'low']) {
      if (this.queues[priority].length > 0) {
        return this.queues[priority].shift();
      }
    }
    return null;
  }

  // Bump the priority of a waiting request
  prioritize(prompt) {
    for (const priority of ['normal', 'low']) {
      const index = this.queues[priority].findIndex(item => item.prompt === prompt);
      if (index !== -1) {
        const [item] = this.queues[priority].splice(index, 1);
        this.queues['high'].push(item);
        return true;
      }
    }
    return false;
  }

  // Cancel low-priority items
  clearLowPriority() {
    const cancelled = this.queues['low'].length;
    this.queues['low'].forEach(item => {
      item.reject(new Error('Cancelled'));
    });
    this.queues['low'] = [];
    return cancelled;
  }

  async executeRequest(prompt) {
    const response = await fetch('/api/ai', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ prompt })
    });
    return response.json();
  }
}
// Usage
const queue = new PriorityAIQueue();

// User clicks a button - high priority
button.onclick = async () => {
  const result = await queue.enqueue('Summarize the selected text', 'high');
  showResult(result);
};

// Background enrichment - normal priority
async function enrichPageData(pageText) {
  const result = await queue.enqueue(`Extract key entities from: ${pageText}`, 'normal');
  cacheEnrichment(result);
}

// Prefetch likely next requests - low priority (may be cancelled, so catch rejections)
async function prefetchSuggestions(candidatePrompt) {
  try {
    await queue.enqueue(candidatePrompt, 'low');
  } catch (error) {
    // Cancelled or failed prefetches are safe to ignore
  }
}
A rough guide for choosing a priority:
- High: the user is waiting and the UI is blocked on the result
- Normal: background task whose result is needed soon
- Low: prefetching and speculative work that can be cancelled (see the sketch below)
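clearLowPriority() earns its keep at moments when speculative work can no longer pay off, such as when the tab is hidden or the user navigates away. A small sketch using the standard visibilitychange event (the low-priority caller above already catches the resulting 'Cancelled' rejection):

// Drop speculative requests as soon as the tab stops being visible
document.addEventListener('visibilitychange', () => {
  if (document.hidden) {
    const dropped = queue.clearLowPriority();
    console.debug(`Cancelled ${dropped} prefetch request(s)`);
  }
});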
Streaming Responses
Reduce perceived latency by showing results as they arrive:
class StreamingAI {
  async query(prompt, onChunk) {
    const response = await fetch('/api/ai/stream', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ prompt })
    });

    const reader = response.body.getReader();
    const decoder = new TextDecoder();
    let buffer = '';
    let fullResponse = '';

    while (true) {
      const { done, value } = await reader.read();
      if (done) break;

      buffer += decoder.decode(value, { stream: true });

      // Process complete lines (SSE format)
      const lines = buffer.split('\n');
      buffer = lines.pop(); // Keep the incomplete line in the buffer

      for (const line of lines) {
        if (line.startsWith('data: ')) {
          const data = line.slice(6);
          if (data === '[DONE]') continue;
          try {
            const parsed = JSON.parse(data);
            const chunk = parsed.choices?.[0]?.delta?.content || '';
            fullResponse += chunk;
            onChunk(chunk, fullResponse);
          } catch (e) {
            // Ignore partial or malformed JSON lines
          }
        }
      }
    }
    return fullResponse;
  }
}
// Usage with progressive rendering
const ai = new StreamingAI();

ai.query('Explain quantum computing', (chunk, full) => {
  // Plain text is cheapest; swap in a markdown renderer if formatting matters
  outputElement.textContent = full;
  // e.g. outputElement.innerHTML = marked.parse(full);
});
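On long answers, re-parsing markdown and touching the DOM on every token can itself become a bottleneck. One mitigation (a sketch, assuming the same ai and outputElement as above) is to coalesce chunk callbacks into at most one render per animation frame:

// Coalesce streaming updates into at most one DOM write per frame
let latestText = '';
let renderScheduled = false;

ai.query('Explain quantum computing', (chunk, full) => {
  latestText = full;
  if (!renderScheduled) {
    renderScheduled = true;
    requestAnimationFrame(() => {
      renderScheduled = false;
      outputElement.innerHTML = marked.parse(latestText);
    });
  }
});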
Smart Caching
Cache responses to avoid redundant API calls:
class AICache {
  constructor(options = {}) {
    this.maxSize = options.maxSize || 100;
    this.ttl = options.ttl || 3600000; // 1 hour default
    this.cache = new Map();
  }

  getCacheKey(prompt, context = {}) {
    // Normalize the prompt for better cache hits
    const normalized = prompt.toLowerCase().trim().replace(/\s+/g, ' ');
    return JSON.stringify({ prompt: normalized, ...context });
  }

  get(prompt, context) {
    const key = this.getCacheKey(prompt, context);
    const entry = this.cache.get(key);
    if (!entry) return null;

    // Check TTL
    if (Date.now() - entry.timestamp > this.ttl) {
      this.cache.delete(key);
      return null;
    }

    // Move to the end (LRU)
    this.cache.delete(key);
    this.cache.set(key, entry);
    return entry.response;
  }

  set(prompt, context, response) {
    const key = this.getCacheKey(prompt, context);

    // Evict the oldest entry if at capacity
    if (this.cache.size >= this.maxSize) {
      const firstKey = this.cache.keys().next().value;
      this.cache.delete(firstKey);
    }

    this.cache.set(key, {
      response,
      timestamp: Date.now()
    });
  }

  // Fuzzy matching for similar prompts
  findSimilar(prompt, threshold = 0.8) {
    const normalized = prompt.toLowerCase().trim();
    for (const [key, entry] of this.cache) {
      // Skip expired entries so fuzzy matches never return stale responses
      if (Date.now() - entry.timestamp > this.ttl) continue;
      const cached = JSON.parse(key).prompt;
      const similarity = this.calculateSimilarity(normalized, cached);
      if (similarity >= threshold) {
        return entry.response;
      }
    }
    return null;
  }

  // Jaccard similarity over word sets
  calculateSimilarity(a, b) {
    const setA = new Set(a.split(' '));
    const setB = new Set(b.split(' '));
    const intersection = [...setA].filter(x => setB.has(x)).length;
    const union = new Set([...setA, ...setB]).size;
    return intersection / union;
  }
}
// Integrated caching
class CachedAI {
  constructor() {
    this.cache = new AICache({ ttl: 1800000 }); // 30 min
  }

  async query(prompt, context = {}) {
    // Check the cache first
    const cached = this.cache.get(prompt, context);
    if (cached) {
      return { ...cached, fromCache: true };
    }

    // Check for similar prompts
    const similar = this.cache.findSimilar(prompt);
    if (similar) {
      return { ...similar, fromCache: true, approximate: true };
    }

    // Make the actual request
    const response = await this.executeQuery(prompt, context);
    this.cache.set(prompt, context, response);
    return response;
  }

  async executeQuery(prompt, context) {
    const response = await fetch('/api/ai', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ prompt, ...context })
    });
    return response.json();
  }
}
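One caveat: the Map lives in memory, so the cache is lost whenever the extension's service worker is suspended or the page reloads. If cross-session hits matter, entries can also be persisted with chrome.storage.local; the sketch below (a hypothetical PersistentAICache, write-through on set, bulk restore on startup) shows one way to do that.

// Write-through persistence for cache entries (Chrome extension context assumed)
class PersistentAICache extends AICache {
  set(prompt, context, response) {
    super.set(prompt, context, response);
    const key = this.getCacheKey(prompt, context);
    // Fire-and-forget; values must be JSON-serializable
    chrome.storage.local.set({ ['ai-cache:' + key]: { response, timestamp: Date.now() } });
  }

  // Call once on startup to reload previously persisted entries
  async restore() {
    const all = await chrome.storage.local.get(null);
    for (const [storageKey, entry] of Object.entries(all)) {
      if (storageKey.startsWith('ai-cache:') && Date.now() - entry.timestamp <= this.ttl) {
        this.cache.set(storageKey.slice('ai-cache:'.length), entry);
      }
    }
  }
}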
Optimistic UI
Show immediate feedback while waiting for AI response:
class OptimisticAI {
  constructor() {
    this.pendingOperations = new Map();
  }

  async query(prompt, optimisticResponse, element) {
    const id = crypto.randomUUID();
    // Tag the target element so later status updates can find it
    if (element) element.dataset.requestId = id;

    // Show the optimistic result immediately
    this.pendingOperations.set(id, {
      prompt,
      optimistic: optimisticResponse,
      status: 'pending'
    });
    this.updateUI(id, optimisticResponse, 'pending');

    try {
      const actualResponse = await this.executeQuery(prompt);
      this.pendingOperations.set(id, {
        ...this.pendingOperations.get(id),
        actual: actualResponse,
        status: 'complete'
      });
      this.updateUI(id, actualResponse, 'complete');
      return actualResponse;
    } catch (error) {
      this.pendingOperations.set(id, {
        ...this.pendingOperations.get(id),
        error,
        status: 'error'
      });
      this.updateUI(id, null, 'error', error);
      throw error;
    }
  }

  updateUI(id, response, status, error = null) {
    const element = document.querySelector(`[data-request-id="${id}"]`);
    if (!element) return;
    element.classList.remove('pending', 'complete', 'error');
    element.classList.add(status);
    if (status === 'pending') {
      element.querySelector('.content').textContent = response;
      element.querySelector('.status').textContent = 'Generating...';
    } else if (status === 'complete') {
      element.querySelector('.content').textContent = response.text;
      element.querySelector('.status').textContent = '';
    } else if (status === 'error') {
      element.querySelector('.status').textContent = 'Error: ' + error.message;
    }
  }

  async executeQuery(prompt) {
    const response = await fetch('/api/ai', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ prompt })
    });
    return response.json();
  }
}
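A usage sketch (the result card markup, resultsContainer, and the response.text shape are assumptions): create an element with .content and .status children, hand it to query() so it gets tagged with the request id, and let the optimistic text be replaced when the real answer lands.

// Usage: show a placeholder card immediately, swap in the real answer later
const ai = new OptimisticAI();

async function summarizeSelection(selectedText) {
  const card = document.createElement('div');
  card.innerHTML = '<div class="content"></div><div class="status"></div>';
  resultsContainer.appendChild(card);

  // The placeholder renders instantly; updateUI() replaces it on completion
  await ai.query(`Summarize: ${selectedText}`, 'Summarizing selection…', card);
}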
Summary
Managing AI latency requires a multi-pronged approach: debounce rapid inputs, batch related requests, prioritize user-facing operations, stream responses for perceived speed, and cache aggressively. Combine these techniques based on your extension’s specific usage patterns; one possible composition is sketched after the list below.
Key optimizations:
- Debounce input with 300-500ms delay
- Batch requests within 100ms windows
- Use priority queues for mixed workloads
- Stream responses for immediate feedback
- Cache with fuzzy matching for better hit rates
- Show optimistic UI while waiting
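As a closing illustration, here is one way the layers above can be composed (a sketch only: DebouncedAI.executeQuery is overridden as a convenient injection point, whereas a real extension might pass the executor into the constructor instead). Typing is debounced, the cache is consulted first, and misses go through the priority queue as user-facing work.

// One possible layering: debounce -> cache -> priority queue
const cache = new AICache({ ttl: 1800000 });
const queue = new PriorityAIQueue();
const debounced = new DebouncedAI(300);

debounced.executeQuery = async (prompt) => {
  const cached = cache.get(prompt, {});
  if (cached) return { ...cached, fromCache: true };

  const result = await queue.enqueue(prompt, 'high'); // the user is waiting
  cache.set(prompt, {}, result);
  return result;
};

inputField.addEventListener('input', async (e) => {
  const result = await debounced.query(e.target.value);
  if (!result.cancelled) displayResult(result);
});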