Dynamic Batching and Latency Management for AI Extensions

Extendable Team · 12 min read

AI-powered browser extensions face a unique challenge: balancing responsiveness with the reality of API latency and rate limits. This guide covers advanced techniques for managing AI requests efficiently, reducing perceived latency, and keeping your extension snappy even under heavy use.

Understanding the Latency Problem

AI API calls typically take 500ms-3s to complete. Without optimization, users experience:

  • Blocking UI during requests
  • Wasted API calls for rapid input changes
  • Rate limit errors under heavy use
  • Poor perceived performance

Latency Breakdown (a measurement sketch follows this list):
  • Network round-trip: 50-200ms
  • Request queuing (server): 0-500ms
  • Token generation: 200ms-2s+
  • Response parsing: 10-50ms
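
To see where a given request's time actually goes, you can instrument the stages on the client. A rough sketch, assuming a generic /api/ai JSON endpoint (rename to match your backend); fetch resolves when response headers arrive, so the split below approximates network-plus-queueing versus generation time:

// Rough per-request timing (hypothetical /api/ai endpoint)
async function timedQuery(prompt) {
  const t0 = performance.now();

  const response = await fetch('/api/ai', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ prompt })
  });
  const tHeaders = performance.now(); // network round-trip + server queueing

  const data = await response.json();
  const tBody = performance.now();    // token generation + body transfer

  console.table({
    'headers (ms)': Math.round(tHeaders - t0),
    'generation (ms)': Math.round(tBody - tHeaders),
    'total (ms)': Math.round(tBody - t0)
  });

  return data;
}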

Request Debouncing

Prevent excessive API calls when users type rapidly:

class DebouncedAI {
  constructor(delayMs = 500) {
    this.delay = delayMs;
    this.timeout = null;
    this.pendingPromise = null;
    this.pendingResolve = null;
  }

  async query(prompt) {
    // Cancel the previously scheduled execution; the shared promise stays alive
    if (this.timeout) {
      clearTimeout(this.timeout);
      this.timeout = null;
    }

    // Reuse one promise so every rapid caller receives the final result
    if (!this.pendingPromise) {
      this.pendingPromise = new Promise((resolve) => {
        this.pendingResolve = resolve;
      });
    }

    // Re-arm the timer with the latest prompt
    this.timeout = setTimeout(async () => {
      const resolve = this.pendingResolve;
      this.pendingPromise = null;
      this.pendingResolve = null;
      this.timeout = null;

      try {
        const result = await this.executeQuery(prompt);
        resolve(result);
      } catch (error) {
        resolve({ error: error.message });
      }
    }, this.delay);

    return this.pendingPromise;
  }

  async executeQuery(prompt) {
    const response = await fetch('/api/ai', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ prompt })
    });
    return response.json();
  }

  // Cancel any pending request
  cancel() {
    if (this.timeout) {
      clearTimeout(this.timeout);
      this.timeout = null;
    }
    if (this.pendingResolve) {
      this.pendingResolve({ cancelled: true });
      this.pendingPromise = null;
      this.pendingResolve = null;
    }
  }
}

// Usage
const ai = new DebouncedAI(300);

inputField.addEventListener('input', async (e) => {
  const result = await ai.query(e.target.value);
  if (!result.cancelled) {
    displayResult(result);
  }
});
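
The cancel() method pairs naturally with focus changes or teardown, so a stale suggestion never overwrites newer UI state:

// Drop any pending debounce when the user leaves the field or the popup closes
inputField.addEventListener('blur', () => ai.cancel());
window.addEventListener('pagehide', () => ai.cancel());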

Request Batching

Combine multiple requests into a single API call:

class BatchedAI {
  constructor(options = {}) {
    this.batchSize = options.batchSize || 5;
    this.batchDelayMs = options.batchDelayMs || 100;
    this.queue = [];
    this.processing = false;
    this.batchTimeout = null;
  }

  async query(prompt, context = {}) {
    return new Promise((resolve, reject) => {
      this.queue.push({ prompt, context, resolve, reject });

      // Start batch timer if not already running
      if (!this.batchTimeout) {
        this.batchTimeout = setTimeout(() => this.processBatch(), this.batchDelayMs);
      }

      // Process immediately if the batch is full
      if (this.queue.length >= this.batchSize) {
        clearTimeout(this.batchTimeout);
        this.batchTimeout = null;
        this.processBatch();
      }
    });
  }

  async processBatch() {
    if (this.processing || this.queue.length === 0) return;

    this.processing = true;
    this.batchTimeout = null;

    // Take items from queue
    const batch = this.queue.splice(0, this.batchSize);

    try {
      // Send all prompts in a single request
      const response = await fetch('/api/ai/batch', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({
          requests: batch.map((item, index) => ({
            id: index,
            prompt: item.prompt,
            context: item.context
          }))
        })
      });

      const results = await response.json();

      // Resolve individual promises
      batch.forEach((item, index) => {
        if (results[index]?.error) {
          item.reject(new Error(results[index].error));
        } else {
          item.resolve(results[index]);
        }
      });
    } catch (error) {
      // Reject all items in batch
      batch.forEach(item => item.reject(error));
    } finally {
      this.processing = false;

      // Process remaining items
      if (this.queue.length > 0) {
        this.batchTimeout = setTimeout(() => this.processBatch(), this.batchDelayMs);
      }
    }
  }
}
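
Usage mirrors the single-request client. The sketch below assumes the hypothetical /api/ai/batch endpoint above returns an array of results in request order; visibleParagraphs and renderSummary are placeholders for your own DOM code:

// Usage
const batcher = new BatchedAI({ batchSize: 5, batchDelayMs: 100 });

async function summarizeVisibleParagraphs() {
  // These calls land in the same 100ms window, so they share one HTTP request
  const summaries = await Promise.all(
    visibleParagraphs.map(p => batcher.query(`Summarize: ${p.textContent}`))
  );

  summaries.forEach((summary, i) => renderSummary(visibleParagraphs[i], summary));
}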

Priority Queue

Handle urgent requests ahead of background tasks:

class PriorityAIQueue {
  constructor() {
    this.queues = {
      high: [],    // User-initiated, visible UI
      normal: [],  // Background enrichment
      low: []      // Prefetching, analytics
    };
    this.concurrency = 2;
    this.activeRequests = 0;
  }

  async enqueue(prompt, priority = 'normal') {
    return new Promise((resolve, reject) => {
      this.queues[priority].push({ prompt, resolve, reject, addedAt: Date.now() });
      this.processQueue();
    });
  }

  async processQueue() {
    if (this.activeRequests >= this.concurrency) return;

    // Get next item by priority
    const item = this.getNextItem();
    if (!item) return;

    this.activeRequests++;

    try {
      const result = await this.executeRequest(item.prompt);
      item.resolve(result);
    } catch (error) {
      item.reject(error);
    } finally {
      this.activeRequests--;
      this.processQueue(); // Process next item
    }
  }

  getNextItem() {
    // Check queues in priority order
    for (const priority of ['high', 'normal', 'low']) {
      if (this.queues[priority].length > 0) {
        return this.queues[priority].shift();
      }
    }
    return null;
  }

  // Bump priority of waiting request
  prioritize(prompt) {
    for (const priority of ['normal', 'low']) {
      const index = this.queues[priority].findIndex(item => item.prompt === prompt);
      if (index !== -1) {
        const [item] = this.queues[priority].splice(index, 1);
        this.queues['high'].push(item);
        return true;
      }
    }
    return false;
  }

  // Cancel low-priority items
  clearLowPriority() {
    const cancelled = this.queues['low'].length;
    this.queues['low'].forEach(item => {
      item.reject(new Error('Cancelled'));
    });
    this.queues['low'] = [];
    return cancelled;
  }

  async executeRequest(prompt) {
    const response = await fetch('/api/ai', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ prompt })
    });
    return response.json();
  }
}

// Usage
const queue = new PriorityAIQueue();

// User clicks button - high priority
button.onclick = async () => {
  const result = await queue.enqueue(prompt, 'high');
  showResult(result);
};

// Background enrichment - normal priority
async function enrichPageData() {
  const result = await queue.enqueue(prompt, 'normal');
  cacheEnrichment(result);
}

// Prefetch likely next requests - low priority
async function prefetchSuggestions() {
  await queue.enqueue(prompt, 'low');
}

Priority Guidelines (see the sketch after this list for re-prioritizing queued work):
  • High: User is waiting, UI is blocked
  • Normal: Background task, result needed soon
  • Low: Prefetching, can be cancelled
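
The prioritize() and clearLowPriority() methods adjust work that is already queued. A sketch, where suggestionElement and suggestionPromptFor() are placeholders; note that matching is by exact prompt string, so the helper must return the same prompt used at enqueue time:

// Bump a prefetched suggestion to high priority when the user hovers it
suggestionElement.addEventListener('mouseenter', () => {
  queue.prioritize(suggestionPromptFor(suggestionElement));
});

// Drop speculative work when the user navigates away
window.addEventListener('pagehide', () => {
  const dropped = queue.clearLowPriority();
  console.debug(`Cancelled ${dropped} low-priority AI requests`);
});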

Streaming Responses

Reduce perceived latency by showing results as they arrive:

class StreamingAI {
  async query(prompt, onChunk) {
    const response = await fetch('/api/ai/stream', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ prompt })
    });

    if (!response.ok || !response.body) {
      throw new Error(`Stream request failed: ${response.status}`);
    }

    const reader = response.body.getReader();
    const decoder = new TextDecoder();
    let buffer = '';
    let fullResponse = '';

    while (true) {
      const { done, value } = await reader.read();

      if (done) break;

      buffer += decoder.decode(value, { stream: true });

      // Process complete lines (SSE format)
      const lines = buffer.split('\n');
      buffer = lines.pop(); // Keep incomplete line in buffer

      for (const line of lines) {
        if (line.startsWith('data: ')) {
          const data = line.slice(6);
          if (data === '[DONE]') continue;

          try {
            const parsed = JSON.parse(data);
            const chunk = parsed.choices?.[0]?.delta?.content || '';
            fullResponse += chunk;
            onChunk(chunk, fullResponse);
          } catch (e) {
            // Ignore parse errors
          }
        }
      }
    }

    return fullResponse;
  }
}

// Usage with progressive rendering
const ai = new StreamingAI();

ai.query('Explain quantum computing', (chunk, full) => {
  // Render the accumulated text as it streams in
  outputElement.textContent = full;
  // Or, if the output is markdown, render it progressively instead:
  // outputElement.innerHTML = marked.parse(full);
});

Smart Caching

Cache responses to avoid redundant API calls:

class AICache {
  constructor(options = {}) {
    this.maxSize = options.maxSize || 100;
    this.ttl = options.ttl || 3600000; // 1 hour default
    this.cache = new Map();
  }

  getCacheKey(prompt, context = {}) {
    // Normalize prompt for better cache hits
    const normalized = prompt.toLowerCase().trim().replace(/\s+/g, ' ');
    return JSON.stringify({ prompt: normalized, ...context });
  }

  get(prompt, context) {
    const key = this.getCacheKey(prompt, context);
    const entry = this.cache.get(key);

    if (!entry) return null;

    // Check TTL
    if (Date.now() - entry.timestamp > this.ttl) {
      this.cache.delete(key);
      return null;
    }

    // Move to end (LRU)
    this.cache.delete(key);
    this.cache.set(key, entry);

    return entry.response;
  }

  set(prompt, context, response) {
    const key = this.getCacheKey(prompt, context);

    // Evict oldest if at capacity
    if (this.cache.size >= this.maxSize) {
      const firstKey = this.cache.keys().next().value;
      this.cache.delete(firstKey);
    }

    this.cache.set(key, {
      response,
      timestamp: Date.now()
    });
  }

  // Fuzzy matching for similar prompts (Jaccard similarity over words)
  findSimilar(prompt, threshold = 0.8) {
    const normalized = prompt.toLowerCase().trim().replace(/\s+/g, ' ');

    for (const [key, entry] of this.cache) {
      // Skip expired entries
      if (Date.now() - entry.timestamp > this.ttl) continue;

      const cached = JSON.parse(key).prompt;
      const similarity = this.calculateSimilarity(normalized, cached);

      if (similarity >= threshold) {
        return entry.response;
      }
    }

    return null;
  }

  calculateSimilarity(a, b) {
    const setA = new Set(a.split(' '));
    const setB = new Set(b.split(' '));

    const intersection = [...setA].filter(x => setB.has(x)).length;
    const union = new Set([...setA, ...setB]).size;

    return intersection / union;
  }
}

// Integrated caching
class CachedAI {
  constructor() {
    this.cache = new AICache({ ttl: 1800000 }); // 30 min
  }

  async query(prompt, context = {}) {
    // Check cache first
    const cached = this.cache.get(prompt, context);
    if (cached) {
      return { ...cached, fromCache: true };
    }

    // Check for similar prompts
    const similar = this.cache.findSimilar(prompt);
    if (similar) {
      return { ...similar, fromCache: true, approximate: true };
    }

    // Make actual request
    const response = await this.executeQuery(prompt, context);
    this.cache.set(prompt, context, response);

    return response;
  }

  async executeQuery(prompt, context) {
    const response = await fetch('/api/ai', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ prompt, ...context })
    });
    return response.json();
  }
}
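
A usage sketch (summarizeButton is a placeholder element); fuzzy hits carry an approximate flag so the UI can signal that the answer came from a similar earlier prompt:

// Usage
const cachedAi = new CachedAI();

summarizeButton.onclick = async () => {
  const result = await cachedAi.query('Summarize this page', { url: location.href });

  if (result.fromCache) {
    console.debug(result.approximate ? 'Served a similar cached answer' : 'Served an exact cached answer');
  }

  displayResult(result);
};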

Optimistic UI

Show immediate feedback while waiting for AI response:

class OptimisticAI {
  constructor(container = document.body) {
    // Result elements are rendered into this container
    this.container = container;
    this.pendingOperations = new Map();
  }

  async query(prompt, optimisticResponse) {
    const id = crypto.randomUUID();

    // Show optimistic result immediately
    this.pendingOperations.set(id, {
      prompt,
      optimistic: optimisticResponse,
      status: 'pending'
    });

    // Render a placeholder element tagged with this request's id
    this.createPlaceholder(id);
    this.updateUI(id, optimisticResponse, 'pending');

    try {
      const actualResponse = await this.executeQuery(prompt);

      this.pendingOperations.set(id, {
        ...this.pendingOperations.get(id),
        actual: actualResponse,
        status: 'complete'
      });

      this.updateUI(id, actualResponse, 'complete');
      return actualResponse;

    } catch (error) {
      this.pendingOperations.set(id, {
        ...this.pendingOperations.get(id),
        error,
        status: 'error'
      });

      this.updateUI(id, null, 'error', error);
      throw error;
    }
  }

  // Create the DOM node that updateUI() targets via its data-request-id attribute
  createPlaceholder(id) {
    const element = document.createElement('div');
    element.dataset.requestId = id;
    element.innerHTML = '<div class="content"></div><div class="status"></div>';
    this.container.appendChild(element);
  }

  updateUI(id, response, status, error = null) {
    const element = document.querySelector(`[data-request-id="${id}"]`);
    if (!element) return;

    element.classList.remove('pending', 'complete', 'error');
    element.classList.add(status);

    if (status === 'pending') {
      element.querySelector('.content').textContent = response;
      element.querySelector('.status').textContent = 'Generating...';
    } else if (status === 'complete') {
      element.querySelector('.content').textContent = response.text;
      element.querySelector('.status').textContent = '';
    } else if (status === 'error') {
      element.querySelector('.status').textContent = 'Error: ' + error.message;
    }
  }

  async executeQuery(prompt) {
    const response = await fetch('/api/ai', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ prompt })
    });
    return response.json();
  }
}
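
A usage sketch: the user's own selection is shown instantly as the optimistic content and swapped for the AI rewrite when it arrives (rewriteButton and the #results container are placeholders):

// Usage
const optimisticAi = new OptimisticAI(document.querySelector('#results'));

rewriteButton.onclick = async () => {
  const selection = window.getSelection().toString();

  try {
    // Placeholder appears immediately; updateUI() swaps in the real text on completion
    await optimisticAi.query(`Rewrite this more concisely: ${selection}`, selection);
  } catch (error) {
    console.warn('Rewrite failed:', error.message);
  }
};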

Summary

Managing AI latency requires a multi-pronged approach: debounce rapid inputs, batch related requests, prioritize user-facing operations, stream responses for perceived speed, and cache aggressively. Combine these techniques based on your extension’s specific usage patterns.

Key optimizations:

  • Debounce input with 300-500ms delay
  • Batch requests within 100ms windows
  • Use priority queues for mixed workloads
  • Stream responses for immediate feedback
  • Cache with fuzzy matching for better hit rates
  • Show optimistic UI while waiting