learning_ai_notes/backend/src/modules/intake/routes.ts

358 lines
11 KiB
TypeScript

/**
* Intake routes — URL intake pipeline + rules CRUD + job status.
*/
import type { FastifyInstance } from 'fastify';
import { randomUUID } from 'node:crypto';
import { getUserId, getRequestProductId } from '../../lib/request-context.js';
import { BadRequestError, NotFoundError } from '@bytelyst/errors';
import { isFeatureEnabled } from '../../lib/feature-flags.js';
import { trackEvent } from '../../lib/telemetry.js';
import { PRODUCT_ID } from '../../lib/product-config.js';
import { classifyUrl } from './url-classifier.js';
import { extractContent } from './extractors.js';
import { getBuiltinIntakeRules } from './seed-rules.js';
import * as repo from './repository.js';
import * as noteRepo from '../notes/repository.js';
import { executePrompt } from '../note-prompts/runner.js';
import * as promptRepo from '../note-prompts/repository.js';
import { stripHtmlForEmbedding } from '../../lib/embeddings.js';
import {
IntakeRequestSchema,
CreateIntakeRuleSchema,
UpdateIntakeRuleSchema,
ListIntakeJobsQuerySchema,
} from './types.js';
import type { IntakeRuleDoc, IntakeJobStatus } from './types.js';
// ── Rate limiter (simple in-memory) ──────────────────────────────
const rateLimitMap = new Map<string, number[]>();
const RATE_LIMIT_WINDOW_MS = 3600_000;
const RATE_LIMIT_MAX = 20;
function checkRateLimit(userId: string): void {
const now = Date.now();
const timestamps = rateLimitMap.get(userId) ?? [];
const recent = timestamps.filter((t) => now - t < RATE_LIMIT_WINDOW_MS);
if (recent.length >= RATE_LIMIT_MAX) {
throw new BadRequestError(`Rate limit exceeded: max ${RATE_LIMIT_MAX} intakes per hour`);
}
recent.push(now);
rateLimitMap.set(userId, recent);
}
// ── Helpers ──────────────────────────────────────────────────────
async function matchIntakeRule(
url: string,
userId: string,
productId: string,
): Promise<IntakeRuleDoc | null> {
const rules = await repo.listIntakeRules(userId, productId);
// Also include built-in rules that may not be persisted yet
const builtinRules = getBuiltinIntakeRules().map((r) => ({
...r,
createdAt: new Date().toISOString(),
updatedAt: new Date().toISOString(),
}));
const allRules = [...rules];
// Add built-in rules not already in the list
for (const br of builtinRules) {
if (!allRules.some((r) => r.id === br.id)) {
allRules.push(br);
}
}
// Sort by priority (lower = higher priority)
allRules.sort((a, b) => a.priority - b.priority);
for (const rule of allRules) {
if (!rule.enabled) continue;
try {
const regex = new RegExp(rule.urlPattern, 'i');
if (regex.test(url)) return rule;
} catch {
// Invalid regex in rule — skip
}
}
return null;
}
async function runIntakeBackground(
jobId: string,
userId: string,
url: string,
contentType: string,
templateSlug: string,
noteId: string,
workspaceId: string,
): Promise<void> {
try {
// Phase 1: Extract content
await repo.updateIntakeJob(jobId, userId, { status: 'extracting' });
const extracted = await extractContent(url, contentType as import('./types.js').IntakeContentType);
await repo.updateIntakeJob(jobId, userId, {
status: 'processing',
extractedText: extracted.text.slice(0, 10_000),
});
// Phase 2: Run prompt template
const template = await promptRepo.getPromptTemplate(templateSlug, userId);
if (!template) {
// Template not found — save raw extracted content to note
await noteRepo.updateNote(noteId, workspaceId, {
title: extracted.title || url,
body: `<p>${extracted.text.replace(/\n/g, '</p><p>')}</p>`,
status: 'active',
});
await repo.updateIntakeJob(jobId, userId, {
status: 'complete',
completedAt: new Date().toISOString(),
});
return;
}
const result = await executePrompt(template, {
templateId: templateSlug,
noteId,
workspaceId,
}, extracted.text);
// Phase 3: Update note with LLM result
await noteRepo.updateNote(noteId, workspaceId, {
title: extracted.title || url,
body: `<p>${result.content.replace(/\n/g, '</p><p>')}</p>`,
status: 'active',
});
await repo.updateIntakeJob(jobId, userId, {
status: 'complete',
completedAt: new Date().toISOString(),
});
trackEvent('intake_job_completed', userId, {
contentType,
templateSlug,
url: new URL(url).hostname,
});
} catch (err) {
const errorMsg = err instanceof Error ? err.message : 'Unknown error';
await repo.updateIntakeJob(jobId, userId, {
status: 'failed',
error: errorMsg,
completedAt: new Date().toISOString(),
}).catch(() => {});
trackEvent('intake_job_failed', userId, { contentType, error: errorMsg });
}
}
// ── Route Plugin ─────────────────────────────────────────────────
export async function intakeRoutes(app: FastifyInstance): Promise<void> {
// ── POST /intake — main intake endpoint ────────────────────────
app.post('/intake', async (req, reply) => {
const userId = getUserId(req);
const productId = getRequestProductId(req);
if (!isFeatureEnabled('notelett_intake_enabled')) {
throw new BadRequestError('Intake feature is not enabled');
}
checkRateLimit(userId);
const parsed = IntakeRequestSchema.safeParse(req.body);
if (!parsed.success) {
throw new BadRequestError(parsed.error.issues.map((i: { message: string }) => i.message).join('; '));
}
const input = parsed.data;
const workspaceId = input.workspaceId || 'default';
// Classify URL
const classification = classifyUrl(input.url);
// Match intake rule
const rule = await matchIntakeRule(input.url, userId, productId);
const templateSlug = input.templateOverride || rule?.templateId || 'article-summary';
// Create draft note
const noteId = `note_intake_${randomUUID().replace(/-/g, '').slice(0, 12)}`;
const now = new Date().toISOString();
await noteRepo.createNote({
id: noteId,
productId: PRODUCT_ID,
workspaceId,
userId,
title: `Processing: ${new URL(input.url).hostname}`,
body: '<p>Processing URL content...</p>',
status: 'draft',
tags: ['intake', classification.contentType],
links: [],
sourceType: 'intake',
sourceUri: input.url,
createdAt: now,
updatedAt: now,
createdBy: userId,
updatedBy: userId,
agentId: 'intake-pipeline',
});
// Create intake job
const jobId = `job_${randomUUID().replace(/-/g, '').slice(0, 12)}`;
await repo.createIntakeJob({
id: jobId,
productId: PRODUCT_ID,
userId,
workspaceId,
noteId,
ruleId: rule?.id || '__auto__',
url: input.url,
contentType: classification.contentType,
templateSlug,
status: 'queued',
startedAt: now,
});
// Fire background processing (no await)
setImmediate(() => {
void runIntakeBackground(
jobId, userId, input.url,
classification.contentType, templateSlug,
noteId, workspaceId,
);
});
trackEvent('intake_submitted', userId, {
contentType: classification.contentType,
templateSlug,
domain: new URL(input.url).hostname,
});
reply.code(202);
return {
jobId,
noteId,
contentType: classification.contentType,
ruleMatched: rule?.name || null,
templateSlug,
status: 'queued',
};
});
// ── GET /intake/jobs — list intake jobs ────────────────────────
app.get('/intake/jobs', async (req) => {
const userId = getUserId(req);
const productId = getRequestProductId(req);
const query = ListIntakeJobsQuerySchema.parse(req.query);
// Support comma-separated statuses (e.g. "queued,extracting,processing")
const statuses = query.status
? query.status.split(',').map((s) => s.trim()).filter(Boolean) as IntakeJobStatus[]
: undefined;
const jobs = await repo.listIntakeJobs(userId, productId, {
statuses,
since: query.since,
limit: query.limit,
offset: query.offset,
});
return { items: jobs, total: jobs.length };
});
// ── GET /intake/jobs/:id — single job status ──────────────────
app.get('/intake/jobs/:id', async (req) => {
const userId = getUserId(req);
const { id } = req.params as { id: string };
const job = await repo.getIntakeJob(id, userId);
if (!job) throw new NotFoundError('Intake job not found');
return job;
});
// ── Intake Rules CRUD ─────────────────────────────────────────
app.get('/intake-rules', async (req) => {
const userId = getUserId(req);
const productId = getRequestProductId(req);
const rules = await repo.listIntakeRules(userId, productId);
// Merge in built-in rules that aren't persisted
const builtinRules = getBuiltinIntakeRules().map((r) => ({
...r,
createdAt: new Date().toISOString(),
updatedAt: new Date().toISOString(),
}));
const all = [...rules];
for (const br of builtinRules) {
if (!all.some((r) => r.id === br.id)) {
all.push(br);
}
}
all.sort((a, b) => a.priority - b.priority);
return { items: all, total: all.length };
});
app.post('/intake-rules', async (req, reply) => {
const userId = getUserId(req);
const productId = getRequestProductId(req);
const input = CreateIntakeRuleSchema.parse(req.body);
const now = new Date().toISOString();
// Validate regex
try {
new RegExp(input.urlPattern);
} catch {
throw new BadRequestError('Invalid URL pattern (must be valid regex)');
}
const doc: IntakeRuleDoc = {
id: `rule_${randomUUID().replace(/-/g, '').slice(0, 12)}`,
productId,
userId,
...input,
createdAt: now,
updatedAt: now,
};
const created = await repo.createIntakeRule(doc);
reply.code(201);
return created;
});
app.patch('/intake-rules/:id', async (req) => {
const userId = getUserId(req);
const { id } = req.params as { id: string };
const existing = await repo.getIntakeRule(id, userId);
if (!existing) throw new NotFoundError('Intake rule not found');
if (existing.userId === '__builtin__') {
throw new BadRequestError('Cannot modify built-in intake rules');
}
const updates = UpdateIntakeRuleSchema.parse(req.body);
if (updates.urlPattern) {
try { new RegExp(updates.urlPattern); } catch { throw new BadRequestError('Invalid URL pattern'); }
}
return repo.updateIntakeRule(id, userId, {
...updates,
updatedAt: new Date().toISOString(),
});
});
app.delete('/intake-rules/:id', async (req, reply) => {
const userId = getUserId(req);
const { id } = req.params as { id: string };
const existing = await repo.getIntakeRule(id, userId);
if (!existing) throw new NotFoundError('Intake rule not found');
if (existing.userId === '__builtin__') {
throw new BadRequestError('Cannot delete built-in intake rules');
}
await repo.deleteIntakeRule(id, userId);
reply.code(204);
});
}