feat(backend): add note intelligence — embeddings, auto-summarize, duplicates, suggest-links, knowledge gaps

Phase 2 of Smart Actions Roadmap:

- Create lib/embeddings.ts: embedText(), cosineSimilarity(), stripHtmlForEmbedding()
- Add embedding + summaryArtifactId fields to NoteDoc
- Create lib/note-hooks.ts: runPostSaveHooks() for background AI enrichment
  - backgroundEmbed: compute and store note embedding vectors
  - backgroundAutoSummarize: generate summary artifact for notes > 300 words
  - Both gated behind feature flags (notelett_auto_embed_enabled, notelett_auto_summarize_enabled)
- Add intelligence endpoints to note-prompts routes:
  - POST /api/notes/:id/suggest-tags (F5) — LLM-generated tag suggestions
  - POST /api/notes/:id/check-duplicates (F8) — cosine similarity duplicate detection
  - POST /api/notes/:id/suggest-links (F9) — related note suggestions
  - POST /api/workspaces/:wsId/knowledge-gaps (F12) — workspace gap analysis
  - POST /api/notes/compare (F14) — multi-note comparison
  - POST /api/notes/merge (F13) — multi-note merge
- Add 4 feature flags for intelligence features
- 12 new tests in embeddings.test.ts (cosine similarity, HTML stripping, embedText)
This commit is contained in:
saravanakumardb1 2026-04-06 08:10:26 -07:00
parent f015ae6f20
commit fe3b0f9b3e
6 changed files with 559 additions and 0 deletions

View File

@ -0,0 +1,81 @@
/**
 * Tests for the embeddings utilities: embedText, cosineSimilarity, and stripHtmlForEmbedding.
 */
import { describe, it, expect, vi } from 'vitest';
import { cosineSimilarity, stripHtmlForEmbedding } from './embeddings.js';
// Replace the LLM client module with a stub: llm() returns an object whose
// embed() always resolves to a single fixed 5-dimensional vector, so
// embedText() can be exercised without a real provider.
vi.mock('./llm.js', () => ({
llm: vi.fn(() => ({
embed: vi.fn(async () => ({
embeddings: [[0.1, 0.2, 0.3, 0.4, 0.5]],
model: 'mock-embed',
usage: { promptTokens: 5, completionTokens: 0, totalTokens: 5 },
})),
})),
}));
// Stub the config module so it exposes only the embedding model name.
vi.mock('./config.js', () => ({
config: { LLM_EMBEDDING_MODEL: 'text-embedding-3-small' },
}));
// Covers the geometric extremes (same, orthogonal, opposite direction), the
// degenerate inputs that short-circuit to 0, and one hand-computed value.
describe('cosineSimilarity', () => {
  it('returns 1 for identical vectors', () => {
    const same = [1, 2, 3, 4];
    const score = cosineSimilarity(same, same);
    expect(score).toBeCloseTo(1, 5);
  });

  it('returns 0 for orthogonal vectors', () => {
    const xAxis = [1, 0];
    const yAxis = [0, 1];
    expect(cosineSimilarity(xAxis, yAxis)).toBeCloseTo(0, 5);
  });

  it('returns -1 for opposite vectors', () => {
    const forward = [1, 0];
    const backward = [-1, 0];
    expect(cosineSimilarity(forward, backward)).toBeCloseTo(-1, 5);
  });

  it('returns 0 for empty vectors', () => {
    const score = cosineSimilarity([], []);
    expect(score).toBe(0);
  });

  it('returns 0 for mismatched lengths', () => {
    const score = cosineSimilarity([1, 2], [1, 2, 3]);
    expect(score).toBe(0);
  });

  it('returns 0 for zero vectors', () => {
    const score = cosineSimilarity([0, 0, 0], [1, 2, 3]);
    expect(score).toBe(0);
  });

  it('computes correct similarity for known vectors', () => {
    const lhs = [1, 2, 3];
    const rhs = [4, 5, 6];
    // Dot product is 1*4 + 2*5 + 3*6 = 32; the norms are sqrt(1+4+9) and sqrt(16+25+36).
    const want = 32 / (Math.sqrt(14) * Math.sqrt(77));
    expect(cosineSimilarity(lhs, rhs)).toBeCloseTo(want, 5);
  });
});
// Checks tag removal, whitespace normalization, and the empty-input case.
describe('stripHtmlForEmbedding', () => {
  it('strips HTML tags', () => {
    const text = stripHtmlForEmbedding('<p>Hello <b>world</b></p>');
    expect(text).toBe('Hello world');
  });

  it('collapses whitespace', () => {
    const text = stripHtmlForEmbedding('Hello \n\n world');
    expect(text).toBe('Hello world');
  });

  it('handles empty string', () => {
    const text = stripHtmlForEmbedding('');
    expect(text).toBe('');
  });

  it('strips complex HTML', () => {
    const markup = '<div class="note"><h1>Title</h1><p>Content with <a href="#">link</a></p></div>';
    const text = stripHtmlForEmbedding(markup);
    expect(text).toBe('Title Content with link');
  });
});
// Runs embedText() against the mocked LLM client and expects the stub's vector back.
describe('embedText', () => {
  it('returns embedding vector', async () => {
    const embeddingsModule = await import('./embeddings.js');
    const vector = await embeddingsModule.embedText('hello world');
    expect(vector).toEqual([0.1, 0.2, 0.3, 0.4, 0.5]);
  });
});

View File

@ -0,0 +1,50 @@
/**
 * Embedding utilities for note intelligence: duplicate detection, related notes, knowledge gaps.
 *
 * Uses @bytelyst/llm embed() when available, falls back gracefully.
 */
import { llm } from './llm.js';
import { config } from './config.js';
/**
 * Produce an embedding vector for `text` using the configured LLM provider.
 *
 * Only the first 8000 characters of `text` are sent, together with the model
 * named by `config.LLM_EMBEDDING_MODEL`.
 *
 * @param text - Text to embed.
 * @returns The first embedding in the provider response, or `null` when the
 *   provider exposes no `embed` function or the response holds no embedding.
 */
export async function embedText(text: string): Promise<number[] | null> {
  const client = llm();
  if (client.embed) {
    const response = await client.embed({
      input: text.slice(0, 8000),
      model: config.LLM_EMBEDDING_MODEL,
    });
    const firstVector = response.embeddings[0];
    return firstVector ?? null;
  }
  return null;
}
/**
 * Cosine similarity of two equal-length numeric vectors.
 *
 * @param a - First vector.
 * @param b - Second vector.
 * @returns The dot product divided by the product of the two Euclidean norms.
 *   Returns 0 when the vectors are empty, differ in length, or either norm
 *   is zero.
 */
export function cosineSimilarity(a: number[], b: number[]): number {
  const dims = a.length;
  if (dims === 0 || dims !== b.length) {
    return 0;
  }

  let dotProduct = 0;
  let sumSquaresA = 0;
  let sumSquaresB = 0;
  for (let idx = 0; idx < dims; idx += 1) {
    dotProduct += a[idx] * b[idx];
    sumSquaresA += a[idx] * a[idx];
    sumSquaresB += b[idx] * b[idx];
  }

  const normProduct = Math.sqrt(sumSquaresA) * Math.sqrt(sumSquaresB);
  return normProduct === 0 ? 0 : dotProduct / normProduct;
}
/** Named character references decoded by stripHtmlForEmbedding. */
const NAMED_HTML_ENTITIES = new Map<string, string>([
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"],
  ['nbsp', ' '],
]);

/**
 * Convert an HTML fragment to plain text suitable for embedding or prompting.
 *
 * Steps, in order:
 * 1. Drop `<script>` / `<style>` elements together with their contents, which
 *    are code rather than note text.
 * 2. Replace every remaining tag with a space so adjacent words stay separate.
 * 3. Decode numeric character references and a small set of named ones
 *    (`&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`, `&nbsp;`). Decoding runs
 *    after tag removal so a decoded `<` is never mistaken for markup, and in
 *    a single pass so `&amp;lt;` becomes `&lt;` rather than `<`. Unknown or
 *    out-of-range references are left untouched.
 * 4. Collapse whitespace runs to single spaces and trim both ends.
 *
 * @param html - HTML (or plain text) to clean.
 * @returns The plain-text content; an empty string for empty input.
 */
export function stripHtmlForEmbedding(html: string): string {
  return html
    .replace(/<(script|style)\b[^>]*>[\s\S]*?<\/\1\s*>/gi, ' ')
    .replace(/<[^>]*>/g, ' ')
    .replace(/&(#[xX][0-9a-fA-F]+|#[0-9]+|[a-zA-Z]+);/g, (entity: string, body: string) => {
      if (body.startsWith('#')) {
        const isHex = body[1] === 'x' || body[1] === 'X';
        const codePoint = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
        // String.fromCodePoint throws above U+10FFFF; keep such references as written.
        return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
      }
      return NAMED_HTML_ENTITIES.get(body) ?? entity;
    })
    .replace(/\s+/g, ' ')
    .trim();
}

View File

@ -13,6 +13,10 @@ const registry = createFlagRegistry({
'copilot.enabled': true,
'chat.rag_enabled': true,
'onboarding.seed_enabled': true,
'notelett_auto_summarize_enabled': false,
'notelett_auto_embed_enabled': false,
'notelett_duplicate_check_enabled': true,
'notelett_suggest_links_enabled': true,
},
enabled: config.FEATURE_FLAGS_ENABLED,
});

View File

@ -0,0 +1,138 @@
/**
 * Note lifecycle hooks: background AI enrichment triggered after save.
 *
 * Runs non-blocking (fire-and-forget) so note save is never delayed.
 * Gated behind feature flags.
 */
import { isFeatureEnabled } from './feature-flags.js';
import { embedText, stripHtmlForEmbedding } from './embeddings.js';
import { llm } from './llm.js';
import type { NoteDoc } from '../modules/notes/types.js';
import { getCollection } from './datastore.js';
import type { FastifyBaseLogger } from 'fastify';
// Minimum number of words in the HTML-stripped body before a note is auto-summarized.
const MIN_WORDS_FOR_SUMMARY = 300;
/**
 * Kick off background AI enrichment after a note is created or updated.
 *
 * Returns immediately; the caller never waits for the enrichment to finish.
 *
 * The hooks run one after another rather than concurrently. Each hook reads
 * the stored note and upserts a spread copy with one extra field, so running
 * them in parallel lets the later upsert overwrite the field the other hook
 * just stored.
 *
 * @param note - The note that was just saved.
 * @param log - Logger used to report hook failures.
 */
export function runPostSaveHooks(
  note: NoteDoc,
  log: FastifyBaseLogger,
): void {
  const hooks = [backgroundEmbed, backgroundAutoSummarize];

  const runSequentially = async (): Promise<void> => {
    for (const hook of hooks) {
      try {
        await hook(note, log);
      } catch (err) {
        // Each hook logs its own failures, but its feature-flag check sits outside
        // its try/catch. Catch here too so nothing becomes an unhandled rejection
        // and a failing hook does not stop the next one.
        log.warn({ noteId: note.id, err }, 'post-save hook failed');
      }
    }
  };

  // Fire-and-forget: errors are logged, never thrown.
  void runSequentially();
}
/**
 * Compute an embedding for the note body and persist it on the stored note.
 *
 * Does nothing when the `notelett_auto_embed_enabled` flag is off, when the
 * HTML-stripped body is shorter than 20 characters, when no embedding could
 * be produced, or when the note can no longer be found. Failures inside the
 * try block are logged at warn level and not rethrown.
 *
 * @param note - The note that was just saved.
 * @param log - Logger for the debug message and any failure.
 */
async function backgroundEmbed(
  note: NoteDoc,
  log: FastifyBaseLogger,
): Promise<void> {
  const enabled = isFeatureEnabled('notelett_auto_embed_enabled');
  if (!enabled) return;

  const { id: noteId, workspaceId } = note;
  try {
    const text = stripHtmlForEmbedding(note.body ?? '');
    // Very short bodies are not embedded.
    if (text.length < 20) return;

    const vector = await embedText(text);
    if (!vector) return;

    // Re-read the stored document and add the embedding to it, so fields
    // written since `note` was captured are carried along.
    const notes = getCollection<NoteDoc>('notes', '/workspaceId');
    const stored = await notes.findById(noteId, workspaceId);
    if (!stored) return;

    await notes.upsert({ ...stored, embedding: vector });
    log.debug({ noteId: note.id }, 'note embedding computed');
  } catch (err) {
    log.warn({ noteId: note.id, err }, 'background embed failed');
  }
}
/**
 * Auto-generate a summary artifact for long notes.
 *
 * Does nothing unless the `notelett_auto_summarize_enabled` flag is on, the
 * HTML-stripped body has at least MIN_WORDS_FOR_SUMMARY words, and the passed
 * note has no `summaryArtifactId` yet. Otherwise it asks the LLM for a short
 * summary, stores it as a `summary` artifact, writes the artifact id back onto
 * the stored note (if it still exists), and records an agent action.
 *
 * Failures inside the try block are logged at warn level and not rethrown.
 *
 * @param note - The note that was just saved.
 * @param log - Logger for the success message and any failure.
 */
async function backgroundAutoSummarize(
note: NoteDoc,
log: FastifyBaseLogger,
): Promise<void> {
// The flag check runs before the try block, so an error thrown here is not caught below.
if (!isFeatureEnabled('notelett_auto_summarize_enabled')) return;
try {
const plainText = stripHtmlForEmbedding(note.body ?? '');
// Count whitespace-separated words; filter(Boolean) drops the empty string produced by an empty body.
const wordCount = plainText.split(/\s+/).filter(Boolean).length;
if (wordCount < MIN_WORDS_FOR_SUMMARY) return;
// Skip if already has a summary
if (note.summaryArtifactId) return;
const provider = llm();
// Only the first 8000 characters of the plain text are sent to the model.
const result = await provider.chatCompletion({
messages: [
{ role: 'system', content: 'Create a concise summary (2-4 sentences) of the following note. Return only the summary.' },
{ role: 'user', content: plainText.slice(0, 8000) },
],
temperature: 0.3,
maxTokens: 512,
});
const summary = result.content.trim();
// An empty model response produces no artifact.
if (!summary) return;
// Store as artifact
// The repository is imported dynamically, at the point of use.
const { createNoteArtifact } = await import('../modules/note-artifacts/repository.js');
// One timestamp is shared by the artifact and the agent action below.
const now = new Date().toISOString();
const artifact = await createNoteArtifact({
id: `summary-${note.id}-${Date.now()}`,
productId: note.productId,
workspaceId: note.workspaceId,
userId: note.userId,
noteId: note.id,
artifactType: 'summary',
title: `Auto-summary of ${note.title}`,
description: summary,
createdAt: now,
updatedAt: now,
createdBy: 'system',
updatedBy: 'system',
});
// Link artifact back to note
// The stored note is re-read and spread so only summaryArtifactId changes.
// If the note is gone, the link is skipped but the action below is still recorded.
const col = getCollection<NoteDoc>('notes', '/workspaceId');
const existing = await col.findById(note.id, note.workspaceId);
if (existing) {
await col.upsert({ ...existing, summaryArtifactId: artifact.id });
}
// Record agent action
const { createNoteAgentAction } = await import('../modules/note-agent-actions/repository.js');
await createNoteAgentAction({
id: `auto-summary-${note.id}-${Date.now()}`,
productId: note.productId,
workspaceId: note.workspaceId,
userId: note.userId,
noteId: note.id,
actorId: 'system',
actorType: 'agent',
toolName: 'auto_summarize',
actionType: 'auto_enrich',
state: 'applied',
reason: `Auto-generated summary for note with ${wordCount} words`,
// Only the first 200 characters of the summary are kept on the action record.
afterSummary: summary.slice(0, 200),
createdAt: now,
updatedAt: now,
createdBy: 'system',
updatedBy: 'system',
});
log.info({ noteId: note.id, artifactId: artifact.id }, 'auto-summary generated');
} catch (err) {
log.warn({ noteId: note.id, err }, 'background auto-summarize failed');
}
}

View File

@ -3,8 +3,11 @@
*/
import type { FastifyInstance } from 'fastify';
import { z } from 'zod';
import { getUserId, getRequestProductId } from '../../lib/request-context.js';
import { BadRequestError, NotFoundError } from '@bytelyst/errors';
import { embedText, cosineSimilarity, stripHtmlForEmbedding } from '../../lib/embeddings.js';
import { llm } from '../../lib/llm.js';
import {
CreatePromptTemplateSchema,
UpdatePromptTemplateSchema,
@ -121,4 +124,285 @@ export async function notePromptRoutes(app: FastifyInstance): Promise<void> {
return { wordCount, readingTimeMinutes };
});
// ── Suggest tags via LLM (F5) ──────────────────────────────────
const SuggestTagsSchema = z.object({
  workspaceId: z.string().min(1).max(128),
});
/**
 * POST /notes/:id/suggest-tags — ask the LLM for up to five tags for a note.
 *
 * Body: `{ workspaceId }`. Responds with `{ tags: string[] }`. The list is
 * empty when the model output cannot be read as a JSON array.
 *
 * Throws BadRequestError when `workspaceId` is missing or invalid, and
 * NotFoundError when the note does not exist or belongs to another
 * user/product.
 */
app.post('/notes/:id/suggest-tags', async (req) => {
  const userId = getUserId(req);
  const productId = getRequestProductId(req);
  const { id } = req.params as { id: string };
  // Validate instead of casting: a request without a JSON body would otherwise
  // fail with a TypeError while destructuring. safeParse keeps the error type
  // callers already receive for a missing workspaceId.
  const body = SuggestTagsSchema.safeParse(req.body);
  if (!body.success) throw new BadRequestError('workspaceId required');
  const { workspaceId } = body.data;
  const note = await noteRepo.getNote(id, workspaceId);
  if (!note || note.userId !== userId || note.productId !== productId) {
    throw new NotFoundError('Note not found');
  }
  const plain = stripHtmlForEmbedding(note.body ?? '');
  const provider = llm();
  const result = await provider.chatCompletion({
    messages: [
      { role: 'system', content: 'Suggest 3-5 tags for this note. Return ONLY a JSON array of lowercase tag strings, e.g. ["tag1","tag2"]. No other text.' },
      { role: 'user', content: `Title: ${note.title}\n\n${plain.slice(0, 4000)}` },
    ],
    temperature: 0.3,
    maxTokens: 128,
  });
  let parsed: unknown;
  try {
    // Models often wrap JSON in a Markdown code fence despite the instruction; unwrap it.
    const raw = result.content
      .trim()
      .replace(/^```(?:json)?\s*/i, '')
      .replace(/\s*```$/, '');
    parsed = JSON.parse(raw);
  } catch {
    return { tags: [] };
  }
  // Valid JSON that is not an array (object, string, null) carries no usable tags.
  if (!Array.isArray(parsed)) return { tags: [] };
  const tags = [
    ...new Set(
      parsed
        .filter((t): t is string => typeof t === 'string')
        .map((t) => t.trim().toLowerCase())
        .filter((t) => t.length > 0),
    ),
  ].slice(0, 5);
  return { tags };
});
// ── Duplicate/similar note detection (F8) ───────────────────────
const CheckDuplicatesSchema = z.object({
  workspaceId: z.string().min(1).max(128),
  threshold: z.coerce.number().min(0).max(1).default(0.85),
  limit: z.coerce.number().int().min(1).max(20).default(5),
});
/**
 * POST /notes/:id/check-duplicates — find notes whose embedding is close to this note's.
 *
 * Body: `{ workspaceId, threshold?, limit? }`. Compares the note against the
 * first 100 notes returned by `listNotes` for the workspace and responds with
 * `{ duplicates }`, sorted by descending similarity and cut to `limit`.
 * Similarities are rounded to two decimals. A `message` field is added when
 * no comparison could be made.
 *
 * Throws NotFoundError when the note does not exist or belongs to another
 * user/product.
 */
app.post('/notes/:id/check-duplicates', async (req) => {
  const userId = getUserId(req);
  const productId = getRequestProductId(req);
  const { id } = req.params as { id: string };
  const input = CheckDuplicatesSchema.parse(req.body);
  const note = await noteRepo.getNote(id, input.workspaceId);
  if (!note || note.userId !== userId || note.productId !== productId) {
    throw new NotFoundError('Note not found');
  }
  const plain = stripHtmlForEmbedding(note.body ?? '');
  // A note without text has nothing to compare; don't send an empty string
  // to the embedding provider.
  if (!plain) {
    return { duplicates: [], message: 'Note has no text content' };
  }
  const noteEmbedding = await embedText(plain);
  if (!noteEmbedding) {
    return { duplicates: [], message: 'Embedding not available' };
  }
  // Fetch all notes in workspace
  const { items: allNotes } = await noteRepo.listNotes(userId, productId, {
    workspaceId: input.workspaceId,
    limit: 100,
    offset: 0,
  });
  const duplicates: Array<{ id: string; title: string; similarity: number }> = [];
  for (const other of allNotes) {
    if (other.id === id) continue;
    // Prefer the stored embedding; compute one on the fly only when it is missing.
    let otherEmbedding = other.embedding;
    if (!otherEmbedding) {
      const otherPlain = stripHtmlForEmbedding(other.body ?? '');
      if (otherPlain.length < 20) continue;
      otherEmbedding = (await embedText(otherPlain)) ?? undefined;
    }
    if (!otherEmbedding) continue;
    const similarity = cosineSimilarity(noteEmbedding, otherEmbedding);
    if (similarity >= input.threshold) {
      duplicates.push({ id: other.id, title: other.title, similarity: Math.round(similarity * 100) / 100 });
    }
  }
  duplicates.sort((a, b) => b.similarity - a.similarity);
  return { duplicates: duplicates.slice(0, input.limit) };
});
// ── Suggest related notes to link (F9) ──────────────────────────
const SuggestLinksSchema = z.object({
  workspaceId: z.string().min(1).max(128),
  threshold: z.coerce.number().min(0).max(1).default(0.6),
  limit: z.coerce.number().int().min(1).max(10).default(5),
});
/**
 * POST /notes/:id/suggest-links — suggest related notes that are not linked yet.
 *
 * Body: `{ workspaceId, threshold?, limit? }`. Compares the note against the
 * first 100 notes returned by `listNotes` for the workspace, skipping the note
 * itself and ids already in `note.links`. Responds with `{ suggestions }`,
 * sorted by descending similarity and cut to `limit`. Similarities are
 * rounded to two decimals. A `message` field is added when no comparison
 * could be made.
 *
 * Throws NotFoundError when the note does not exist or belongs to another
 * user/product.
 */
app.post('/notes/:id/suggest-links', async (req) => {
  const userId = getUserId(req);
  const productId = getRequestProductId(req);
  const { id } = req.params as { id: string };
  const input = SuggestLinksSchema.parse(req.body);
  const note = await noteRepo.getNote(id, input.workspaceId);
  if (!note || note.userId !== userId || note.productId !== productId) {
    throw new NotFoundError('Note not found');
  }
  const plain = stripHtmlForEmbedding(note.body ?? '');
  // A note without text has nothing to compare; don't send an empty string
  // to the embedding provider.
  if (!plain) {
    return { suggestions: [], message: 'Note has no text content' };
  }
  const noteEmbedding = await embedText(plain);
  if (!noteEmbedding) {
    return { suggestions: [], message: 'Embedding not available' };
  }
  const { items: allNotes } = await noteRepo.listNotes(userId, productId, {
    workspaceId: input.workspaceId,
    limit: 100,
    offset: 0,
  });
  // Exclude already-linked notes
  const existingLinks = new Set(note.links ?? []);
  const suggestions: Array<{ id: string; title: string; similarity: number }> = [];
  for (const other of allNotes) {
    if (other.id === id || existingLinks.has(other.id)) continue;
    // Prefer the stored embedding; compute one on the fly only when it is missing.
    let otherEmbedding = other.embedding;
    if (!otherEmbedding) {
      const otherPlain = stripHtmlForEmbedding(other.body ?? '');
      if (otherPlain.length < 20) continue;
      otherEmbedding = (await embedText(otherPlain)) ?? undefined;
    }
    if (!otherEmbedding) continue;
    const similarity = cosineSimilarity(noteEmbedding, otherEmbedding);
    if (similarity >= input.threshold) {
      suggestions.push({ id: other.id, title: other.title, similarity: Math.round(similarity * 100) / 100 });
    }
  }
  suggestions.sort((a, b) => b.similarity - a.similarity);
  return { suggestions: suggestions.slice(0, input.limit) };
});
// ── Knowledge gap detection (F12) ───────────────────────────────
/**
 * POST /workspaces/:wsId/knowledge-gaps — ask the LLM which topics a workspace under-covers.
 *
 * Sends the titles and tags of the first 100 notes returned by `listNotes` to
 * the model and responds with `{ gaps, topicMap }`. When the model output is
 * not a JSON object, both are empty and the unparsed text is returned in `raw`.
 */
app.post('/workspaces/:wsId/knowledge-gaps', async (req) => {
  const userId = getUserId(req);
  const productId = getRequestProductId(req);
  const { wsId } = req.params as { wsId: string };
  const { items: notes } = await noteRepo.listNotes(userId, productId, {
    workspaceId: wsId,
    limit: 100,
    offset: 0,
  });
  if (notes.length === 0) {
    return { gaps: [], topicMap: {} };
  }
  // Build topic summary from note titles and tags
  const topicLines = notes.map((n) => {
    const tags = (n.tags ?? []).join(', ');
    return `- "${n.title}"${tags ? ` [tags: ${tags}]` : ''}`;
  }).join('\n');
  const provider = llm();
  const result = await provider.chatCompletion({
    messages: [
      {
        role: 'system',
        content: `You analyze a workspace of notes and identify knowledge gaps.
Return a JSON object with:
- "gaps": array of { "topic": string, "description": string, "suggestedTitle": string }
- "topicMap": object mapping topic names to counts
Return ONLY valid JSON, no other text.`,
      },
      {
        role: 'user',
        content: `This workspace has ${notes.length} notes:\n${topicLines}\n\nIdentify 3-5 topics that are mentioned but under-covered, or important related topics that are missing entirely.`,
      },
    ],
    temperature: 0.5,
    maxTokens: 1024,
  });
  try {
    // Models often wrap JSON in a Markdown code fence despite the instruction; unwrap it.
    const raw = result.content
      .trim()
      .replace(/^```(?:json)?\s*/i, '')
      .replace(/\s*```$/, '');
    const parsed: unknown = JSON.parse(raw);
    // Anything other than a plain JSON object is treated like unparseable output.
    if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
      return { gaps: [], topicMap: {}, raw: result.content };
    }
    const { gaps, topicMap } = parsed as { gaps?: unknown; topicMap?: unknown };
    const isPlainObject = typeof topicMap === 'object' && topicMap !== null && !Array.isArray(topicMap);
    // Only pass through fields that have the container shape the prompt asked for.
    return {
      gaps: Array.isArray(gaps) ? gaps : [],
      topicMap: isPlainObject ? topicMap : {},
    };
  } catch {
    return { gaps: [], topicMap: {}, raw: result.content };
  }
});
// ── Compare notes (F14) ─────────────────────────────────────────
const CompareNotesSchema = z.object({
  noteIds: z.array(z.string().min(1)).min(2).max(5),
  workspaceId: z.string().min(1).max(128),
});
/**
 * POST /notes/compare — have the LLM compare two to five notes.
 *
 * Body: `{ noteIds, workspaceId }`. Notes that do not exist or belong to
 * another user/product are silently dropped. Responds with the model's
 * comparison text, model name, token usage and the number of notes compared.
 *
 * Throws BadRequestError when fewer than two distinct accessible notes remain.
 */
app.post('/notes/compare', async (req) => {
  const userId = getUserId(req);
  const productId = getRequestProductId(req);
  const input = CompareNotesSchema.parse(req.body);
  // Collapse repeated ids so one note listed twice cannot satisfy the
  // "at least 2 notes" requirement below.
  const uniqueIds = [...new Set(input.noteIds)];
  const notes = await Promise.all(
    uniqueIds.map((nid) => noteRepo.getNote(nid, input.workspaceId)),
  );
  // flatMap both filters and narrows away missing notes, so no `!` assertions are needed later.
  const validNotes = notes.flatMap((n) =>
    n && n.userId === userId && n.productId === productId ? [n] : [],
  );
  if (validNotes.length < 2) {
    throw new BadRequestError('Need at least 2 accessible notes to compare');
  }
  // Each note contributes its title and at most 2000 characters of plain text.
  const noteSummaries = validNotes.map((n) => {
    const plain = stripHtmlForEmbedding(n.body ?? '').slice(0, 2000);
    return `## ${n.title}\n${plain}`;
  }).join('\n\n---\n\n');
  const provider = llm();
  const result = await provider.chatCompletion({
    messages: [
      { role: 'system', content: 'Compare the following notes. Identify similarities, differences, contradictions, and complementary information. Structure your response with clear headings.' },
      { role: 'user', content: noteSummaries },
    ],
    temperature: 0.4,
    maxTokens: 2048,
  });
  return {
    content: result.content,
    model: result.model,
    usage: result.usage,
    noteCount: validNotes.length,
  };
});
// ── Merge notes (F13) ──────────────────────────────────────────
const MergeNotesSchema = z.object({
  noteIds: z.array(z.string().min(1)).min(2).max(10),
  workspaceId: z.string().min(1).max(128),
});
/**
 * POST /notes/merge — have the LLM draft a single document from two to ten notes.
 *
 * Body: `{ noteIds, workspaceId }`. Notes that do not exist or belong to
 * another user/product are silently dropped. Responds with the merged text,
 * model name, token usage and the ids of the notes that were used. This
 * handler only returns the text; it does not write any note.
 *
 * Throws BadRequestError when fewer than two distinct accessible notes remain.
 */
app.post('/notes/merge', async (req) => {
  const userId = getUserId(req);
  const productId = getRequestProductId(req);
  const input = MergeNotesSchema.parse(req.body);
  // Collapse repeated ids so one note listed twice is neither merged with
  // itself nor able to satisfy the "at least 2 notes" requirement below.
  const uniqueIds = [...new Set(input.noteIds)];
  const notes = await Promise.all(
    uniqueIds.map((nid) => noteRepo.getNote(nid, input.workspaceId)),
  );
  // flatMap both filters and narrows away missing notes, so no `!` assertions are needed later.
  const validNotes = notes.flatMap((n) =>
    n && n.userId === userId && n.productId === productId ? [n] : [],
  );
  if (validNotes.length < 2) {
    throw new BadRequestError('Need at least 2 accessible notes to merge');
  }
  // Each note contributes its title and at most 3000 characters of plain text.
  const noteContents = validNotes.map((n) => {
    const plain = stripHtmlForEmbedding(n.body ?? '').slice(0, 3000);
    return `## ${n.title}\n${plain}`;
  }).join('\n\n---\n\n');
  const provider = llm();
  const result = await provider.chatCompletion({
    messages: [
      { role: 'system', content: 'Merge the following notes into a single coherent document. Combine information logically, remove redundancy, preserve all unique facts. Use clear headings and structure.' },
      { role: 'user', content: noteContents },
    ],
    temperature: 0.3,
    maxTokens: 4096,
  });
  return {
    content: result.content,
    model: result.model,
    usage: result.usage,
    sourceNoteIds: validNotes.map((n) => n.id),
  };
});
}

View File

@ -20,6 +20,8 @@ export interface NoteDoc {
createdBy: string;
updatedBy: string;
agentId?: string;
embedding?: number[];
summaryArtifactId?: string;
_ts?: number;
_etag?: string;
}