/**
 * Deduplication utilities for palace memories.
 *
 * Detects near-duplicate content using cosine similarity over embeddings.
 * Products handle the Cosmos/DB queries; this module operates on pure data.
 */

import { cosineSimilarity } from './cosine.js';

/**
 * Check if a candidate embedding is a near-duplicate of any existing embedding.
 *
 * Existing embeddings whose length differs from the candidate's are skipped
 * rather than compared. The comparison is strict: a similarity exactly equal
 * to `threshold` does not count as a duplicate.
 *
 * @param candidate - Embedding of the new memory
 * @param existingEmbeddings - Embeddings of existing memories in the same room/hall
 * @param threshold - Cosine similarity threshold (default: 0.90)
 * @returns true if any same-length existing embedding exceeds the threshold
 */
export function isContentDuplicate(
  candidate: number[],
  existingEmbeddings: number[][],
  threshold = 0.9
): boolean {
  for (const existing of existingEmbeddings) {
    // Vectors of different length are not comparable; ignore them.
    if (existing.length !== candidate.length) continue;
    if (cosineSimilarity(candidate, existing) > threshold) {
      return true;
    }
  }
  return false;
}

/**
 * Check if two text strings are exact duplicates after normalization.
 *
 * Normalization trims leading/trailing whitespace and lowercases both
 * strings; whitespace inside the strings is left untouched.
 *
 * @param a - First string
 * @param b - Second string
 * @returns true if the normalized strings are identical
 */
export function isExactDuplicate(a: string, b: string): boolean {
  return a.trim().toLowerCase() === b.trim().toLowerCase();
}

/**
 * Find the most similar embedding and return its index + score.
 *
 * Existing embeddings whose length differs from the candidate's are skipped.
 * A score must be strictly greater than `minScore` to qualify, so with the
 * default of 0 an embedding scoring 0 or below is never returned. When
 * several embeddings share the best score, the earliest index wins.
 *
 * @param candidate - Embedding to compare against the existing ones
 * @param existingEmbeddings - Embeddings to search
 * @param minScore - Exclusive lower bound on the score (default: 0)
 * @returns the index into `existingEmbeddings` and score of the best match,
 *          or null if no embeddings exist or none exceed `minScore`
 */
export function findClosestMatch(
  candidate: number[],
  existingEmbeddings: number[][],
  minScore = 0
): { index: number; score: number } | null {
  let bestIndex = -1;
  let bestScore = minScore;

  // entries() yields typed elements, so this also compiles under
  // `noUncheckedIndexedAccess` (indexed access would be `number[] | undefined`).
  for (const [index, existing] of existingEmbeddings.entries()) {
    if (existing.length !== candidate.length) continue;
    const score = cosineSimilarity(candidate, existing);
    // Strict comparison keeps the first of any equally scored matches.
    if (score > bestScore) {
      bestScore = score;
      bestIndex = index;
    }
  }

  return bestIndex >= 0 ? { index: bestIndex, score: bestScore } : null;
}