// learning_ai_common_plat/packages/palace/src/extraction.ts

/**
 * Memory extraction utilities — prompt building, response parsing, regex fallback.
 *
 * Products call their own LLM provider with the prompt from buildExtractionPrompt(),
 * then pass the response to parseExtractionResponse().
 * If LLM is unavailable, regexFallbackExtraction() provides basic extraction.
 */

import type { ExtractedMemory } from './types.js';

/**
 * Inputs that shape the prompt produced by buildExtractionPrompt().
 */
export interface ExtractionContext {
  /** Optional title; when non-empty it is added to the prompt as a "Title:" line. */
  title?: string;
  /** Optional extra context; when non-empty it is added to the prompt as a "Context:" line. */
  context?: string;
  /** Hall names listed in the prompt as the allowed values of each memory's "hall" field. */
  hallTypes: readonly string[];
}

/**
 * Assemble the prompt that asks an LLM to pull structured memories out of text.
 *
 * The prompt describes the expected JSON array (hall, content, roomSlug,
 * entities), lists the extraction rules, optionally adds "Title:" and
 * "Context:" lines, and ends with the raw content.
 *
 * @param content - The text content to extract memories from
 * @param ctx - Allowed hall types plus optional title and additional context
 * @returns The complete prompt as a single newline-separated string
 */
export function buildExtractionPrompt(content: string, ctx: ExtractionContext): string {
  const allowedHalls = ctx.hallTypes.join(', ');

  const promptLines = [
    'Extract structured memories from the following content.',
    '',
    'For each distinct memory, return a JSON array where each element has:',
    `- "hall": one of [${allowedHalls}]`,
    '- "content": the memory summarized in 1-2 sentences',
    '- "roomSlug": a short kebab-case topic slug (e.g. "auth-migration", "api-design")',
    '- "entities": array of named entities mentioned (people, projects, technologies, places)',
    '',
    'Rules:',
    '- Only extract genuinely important or referenceable facts, decisions, or events',
    '- Skip trivial or obvious statements',
    '- Each memory should be self-contained (understandable without the original context)',
    '- Prefer specific details over vague summaries',
    '- Return valid JSON only — no markdown fences, no explanation',
  ];

  // Title and context are optional; empty strings are skipped just like undefined.
  if (ctx.title) {
    promptLines.push(`Title: ${ctx.title}`);
  }
  if (ctx.context) {
    promptLines.push(`Context: ${ctx.context}`);
  }

  // The content always comes last, separated from the instructions by a blank line.
  return `${promptLines.join('\n')}\n\nContent:\n${content}`;
}

/**
 * Narrow an arbitrary JSON value to a non-array object.
 */
function isJsonObject(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null && !Array.isArray(value);
}

/**
 * Convert a JSON scalar to trimmed text.
 *
 * Strings are trimmed and numbers are stringified. Anything else (null,
 * booleans, objects, arrays) yields '' so it never surfaces as
 * "[object Object]" or "null" in a stored memory.
 */
function scalarToText(value: unknown): string {
  if (typeof value === 'string') return value.trim();
  if (typeof value === 'number') return String(value);
  return '';
}

/**
 * Try progressively looser readings of the LLM output and return the first
 * one that is valid JSON, or undefined when none parses.
 *
 * Order: the text as-is, then the body of the first markdown code fence, then
 * the span from the first '[' to the last ']'.
 */
function parseFirstJsonCandidate(text: string): unknown {
  const candidates = [text];

  // Fenced block anywhere in the output, with an optional language tag.
  const fencedBody = /```[\w+-]*\s*([\s\S]*?)```/.exec(text)?.[1];
  if (fencedBody !== undefined) candidates.push(fencedBody.trim());

  // Outermost bracket span: covers unterminated fences and prose around a bare array.
  const open = text.indexOf('[');
  const close = text.lastIndexOf(']');
  if (open !== -1 && close > open) candidates.push(text.slice(open, close + 1));

  for (const candidate of candidates) {
    try {
      return JSON.parse(candidate);
    } catch {
      // Not valid JSON in this form; fall through to the next candidate.
    }
  }
  return undefined;
}

/**
 * Parse an LLM extraction response into ExtractedMemory[].
 *
 * Handles:
 * - Clean JSON arrays
 * - JSON wrapped in markdown code fences, including when the model adds prose
 *   before or after the fence
 * - Malformed JSON or a non-array payload (returns empty array)
 *
 * Elements are kept only when they are objects whose `hall` and `content` are
 * non-empty after trimming. `roomSlug` falls back to `room_slug`, then to
 * 'general'. Entities that are not strings or numbers are dropped.
 *
 * @param llmOutput - Raw text returned by the LLM
 * @returns The memories that could be recovered; never throws on bad input
 */
export function parseExtractionResponse(llmOutput: string): ExtractedMemory[] {
  if (!llmOutput) return [];

  const text = llmOutput.trim();
  if (text.length === 0) return [];

  const parsed = parseFirstJsonCandidate(text);
  if (!Array.isArray(parsed)) return [];

  const memories: ExtractedMemory[] = [];
  for (const item of parsed) {
    if (!isJsonObject(item)) continue;

    const hall = scalarToText(item.hall);
    const content = scalarToText(item.content);
    // A memory without a hall or without text is unusable; skip it.
    if (!hall || !content) continue;

    const entities = Array.isArray(item.entities)
      ? item.entities.map(entity => scalarToText(entity)).filter(entity => entity.length > 0)
      : [];

    memories.push({
      hall,
      content,
      // `||` is deliberate: an empty slug should fall through to the next option.
      roomSlug: scalarToText(item.roomSlug) || scalarToText(item.room_slug) || 'general',
      entities,
    });
  }

  return memories;
}

/**
 * Line-label alternatives (matched case-insensitively at the start of a line)
 * paired with the hall a matching line is filed under.
 */
const FALLBACK_LABELS: ReadonlyArray<readonly [string, RegExp]> = [
  ['decisions', /decision|decided|resolve[ds]?/],
  ['decisions', /todo|action|task/],
  ['discoveries', /found|discovered|learned|til/],
  ['preferences', /prefer|always|never/],
  ['events', /event|happened|occurred/],
  ['advice', /tip|note|remember|important/],
  ['errors', /error|bug|issue|broken/],
  ['patterns', /pattern|recurring|trend/],
  ['emotions', /feeling|mood|emotion/],
  ['insights', /insight|observation|noticed/],
];

/**
 * What must follow a label: the colon, optionally with Markdown emphasis
 * markers closing before it ("Decision**:") or right after it ("Decision:** "),
 * then the memory text. Emphasis after the colon is only consumed when it is
 * followed by whitespace, so "Note:*urgent* fix" keeps its emphasis.
 */
const FALLBACK_LABEL_TAIL = /(?:[*_]{1,3}:|:[*_]{1,3}(?=\s)|:)\s*(.+)/.source;

/** Compiled once at module load instead of on every call. */
const FALLBACK_RULES: ReadonlyArray<{ hall: string; regex: RegExp }> = FALLBACK_LABELS.map(
  ([hall, labels]) => ({
    hall,
    regex: new RegExp(`^(?:${labels.source})${FALLBACK_LABEL_TAIL}`, 'i'),
  })
);

/**
 * Leading noise stripped from each line before matching: whitespace, bullet,
 * quote, heading and emphasis markers, and ordered-list numbers ("1." / "2)").
 */
const FALLBACK_LINE_PREFIX = /^(?:[\s\-*>#_]|\d+[.)]\s)+/;

/**
 * Regex-based fallback extraction when LLM is unavailable.
 *
 * Each line is examined on its own. After list, quote, heading and emphasis
 * markers are stripped, a line that starts with one of these labels followed
 * by a colon becomes a memory in the given hall:
 * - "Decision:" / "Decided:" / "Resolve(d|s):" → decisions
 * - "TODO:" / "Action:" / "Task:" → decisions
 * - "Found:" / "Discovered:" / "Learned:" / "TIL:" → discoveries
 * - "Prefer:" / "Always:" / "Never:" → preferences
 * - "Event:" / "Happened:" / "Occurred:" → events
 * - "Tip:" / "Note:" / "Remember:" / "Important:" → advice
 * - "Error:" / "Bug:" / "Issue:" / "Broken:" → errors
 * - "Pattern:" / "Recurring:" / "Trend:" → patterns
 * - "Feeling:" / "Mood:" / "Emotion:" → emotions
 * - "Insight:" / "Observation:" / "Noticed:" → insights
 *
 * Labels are case-insensitive and may be written in Markdown emphasis. Dates
 * are not detected. Every memory gets roomSlug 'general'; entities come from
 * extractEntities() applied to the text after the label.
 *
 * @param content - Raw text content
 * @returns Array of extracted memories (best-effort), in line order
 */
export function regexFallbackExtraction(content: string): ExtractedMemory[] {
  const memories: ExtractedMemory[] = [];

  for (const line of content.split('\n')) {
    const trimmed = line.replace(FALLBACK_LINE_PREFIX, '').trim();
    if (!trimmed) continue;

    // A line yields at most one memory: the first rule that matches wins.
    for (const { regex, hall } of FALLBACK_RULES) {
      const body = regex.exec(trimmed)?.[1];
      if (!body) continue;

      memories.push({
        hall,
        content: body.trim(),
        roomSlug: 'general',
        entities: extractEntities(body),
      });
      break;
    }
  }

  return memories;
}

/**
 * Extract simple entities from text: `@mentions` and `#tags`.
 *
 * A marker only counts when it starts a token. `@` inside an e-mail address
 * (`bob@example.com`) and `#` inside a URL fragment (`/docs#setup`) or an HTML
 * numeric entity (`&#39;`) are ignored, so those fragments do not leak into
 * the entity list.
 *
 * @param text - Text to scan
 * @returns Unique names without the leading marker: mentions first, then tags,
 *          each in order of first appearance
 */
function extractEntities(text: string): string[] {
  const entities = new Set<string>();

  // Adds every match of `pattern` to the set, minus its one-character marker.
  const collect = (pattern: RegExp): void => {
    for (const token of text.match(pattern) ?? []) {
      entities.add(token.slice(1));
    }
  };

  // @mentions — not preceded by characters that can end an e-mail local part.
  collect(/(?<![\w.+-])@\w+/g);

  // #tags — not preceded by a word character, a URL slash, or an entity '&'.
  collect(/(?<![\w/&])#\w+/g);

  return Array.from(entities);
}