/**
 * Vector similarity utilities for semantic search and deduplication.
 */

/**
 * Compute the cosine similarity between two vectors.
 *
 * @param a - First vector
 * @param b - Second vector
 * @returns A value in [-1, 1] (1 = identical direction). Returns 0 when the
 *   vectors are empty, their dimensions differ, or either has zero magnitude.
 */
export function cosineSimilarity(a: readonly number[], b: readonly number[]): number {
  if (a.length !== b.length || a.length === 0) return 0;

  let dotProduct = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < a.length; i++) {
    dotProduct += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }

  const denominator = Math.sqrt(normA) * Math.sqrt(normB);
  if (denominator === 0) return 0;

  // Floating-point rounding can push the ratio marginally outside [-1, 1]
  // (e.g. 1.0000000000000002 for parallel vectors); clamp so the documented
  // range holds and callers can safely feed the result to Math.acos.
  return Math.max(-1, Math.min(1, dotProduct / denominator));
}

/**
 * Normalize a vector to unit length (magnitude = 1).
 *
 * @param v - The vector to normalize; it is not modified
 * @returns A new array with the same number of elements. If the input has
 *   zero magnitude (including the empty vector), every element is 0.
 */
export function normalizeVector(v: readonly number[]): number[] {
  const magnitude = Math.sqrt(v.reduce((sum, val) => sum + val * val, 0));
  // A zero-magnitude vector has no direction; avoid dividing by zero.
  if (magnitude === 0) return v.map(() => 0);
  return v.map(val => val / magnitude);
}

/**
 * Find the top-K most similar items to a query vector.
 *
 * Items for which `getEmbedding` returns `undefined` or an empty array are
 * skipped, as are items whose score is below `minScore`.
 *
 * @typeParam T - The type of the items being searched
 * @param query - The query embedding vector
 * @param items - Array of items to search
 * @param getEmbedding - Function to extract embedding from an item (returns undefined if missing)
 * @param k - Maximum number of results to return; a value that is not
 *   greater than 0 (including NaN) yields an empty array
 * @param minScore - Minimum cosine similarity score (default: 0)
 * @returns Sorted array of { item, score } pairs, highest score first
 */
export function topKByCosine<T>(
  query: readonly number[],
  items: readonly T[],
  getEmbedding: (item: T) => number[] | undefined,
  k: number,
  minScore = 0
): Array<{ item: T; score: number }> {
  // A negative k would make slice(0, k) drop items from the END of the list
  // rather than return nothing, so reject every non-positive (or NaN) k here.
  if (!(k > 0)) return [];

  const scored: Array<{ item: T; score: number }> = [];
  for (const item of items) {
    const embedding = getEmbedding(item);
    if (!embedding || embedding.length === 0) continue;

    const score = cosineSimilarity(query, embedding);
    if (score >= minScore) {
      scored.push({ item, score });
    }
  }

  scored.sort((x, y) => y.score - x.score);
  return scored.slice(0, k);
}