#!/usr/bin/env node
/**
 * lesson-extract-worker (example)
 *
 * Watches a repo-backed inbox for lesson extraction tasks and processes them asynchronously:
 * - reads session JSONL transcript (streaming)
 * - Map: per-chunk extraction via Gemini (native API)
 * - Reduce: merge/dedupe/score -> 0..20 lessons
 * - writes to LanceDB Pro via `openclaw memory-pro import`
 * - sends Telegram notification via `openclaw message send` (optional)
 */
|
||
|
||
import fs from "node:fs";
import fsp from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { fileURLToPath } from "node:url";
import { spawn } from "node:child_process";
import readline from "node:readline";
|
||
|
||
// ES modules have no built-in __filename/__dirname, so derive them from the module URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// In your deployment, set LESSON_QUEUE_ROOT to your workspace queue.
// When it is unset (or empty) the queue is looked up inside REPO_ROOT, which is
// resolved five directory levels above this script (layout similar to OpenClaw-Memory).
const REPO_ROOT = path.resolve(__dirname, "..", "..", "..", "..", "..");
const QUEUE_ROOT = process.env.LESSON_QUEUE_ROOT || path.join(REPO_ROOT, "workspaces", "main", "tasks", "lesson-extract");

// Queue stage directories, all direct children of QUEUE_ROOT.
const [INBOX, PROCESSING, DONE, ERROR] = ["inbox", "processing", "done", "error"].map((stage) =>
  path.join(QUEUE_ROOT, stage),
);

// Gemini credentials/model come from the environment; the model has a default.
const { GEMINI_API_KEY } = process.env;
const GEMINI_MODEL = process.env.GEMINI_MODEL || "gemini-3-flash-preview";

// `--once` anywhere on the command line selects single-pass mode.
const ONCE = process.argv.includes("--once");
|
||
|
||
/**
 * Creates the four queue stage directories (inbox, processing, done, error),
 * including missing parents. Calling it again when they exist is a no-op.
 */
function ensureDirs() {
  [INBOX, PROCESSING, DONE, ERROR].forEach((dir) => {
    fs.mkdirSync(dir, { recursive: true });
  });
}
|
||
|
||
/**
 * @returns {string} The current time as an ISO-8601 UTC string
 *   (the format produced by `Date.prototype.toISOString`).
 */
function nowIso() {
  const current = new Date();
  return current.toISOString();
}
|
||
|
||
/**
 * Runs a command, capturing stdout and stderr, and never rejects for ordinary
 * process failures.
 *
 * @param {string} cmd - Executable to launch.
 * @param {string[]} args - Arguments passed to the executable.
 * @param {object} [opts] - Extra `child_process.spawn` options. `stdio` is
 *   always overridden so that stdin is ignored and stdout/stderr are piped.
 * @returns {Promise<{code: number, out: string, err: string}>} Exit code plus
 *   captured output. `code` is non-zero when the process could not be started
 *   or was terminated by a signal; in both cases a short explanation is
 *   appended to `err`.
 */
function run(cmd, args, opts = {}) {
  return new Promise((resolve) => {
    const child = spawn(cmd, args, { ...opts, stdio: ["ignore", "pipe", "pipe"] });
    let out = "";
    let err = "";

    const appendNote = (note) => {
      err += err && !err.endsWith("\n") ? `\n${note}` : note;
    };

    // Decode at stream level so a multi-byte UTF-8 character that straddles two
    // chunks is not corrupted (per-chunk Buffer#toString would split it).
    child.stdout?.setEncoding("utf-8");
    child.stderr?.setEncoding("utf-8");
    child.stdout?.on("data", (text) => {
      out += text;
    });
    child.stderr?.on("data", (text) => {
      err += text;
    });

    // Without an 'error' listener a missing or non-executable binary raises an
    // uncaught exception and takes the whole worker down.
    child.on("error", (spawnErr) => {
      appendNote(`failed to run ${cmd}: ${spawnErr.message}`);
      resolve({ code: 1, out, err });
    });

    child.on("close", (code, signal) => {
      if (code === null) {
        // A null exit code means the child was killed by a signal; that must
        // not be reported as success.
        appendNote(`${cmd} terminated by signal ${signal}`);
        resolve({ code: 1, out, err });
        return;
      }
      // No-op if the 'error' handler already settled the promise.
      resolve({ code, out, err });
    });
  });
}
|
||
|
||
/**
 * Parses JSON without throwing.
 *
 * @param {string} text - Candidate JSON text.
 * @returns {*} The parsed value, or `null` when `text` is not valid JSON
 *   (a literal JSON `null` therefore also yields `null`).
 */
function safeJsonParse(text) {
  let parsed = null;
  try {
    parsed = JSON.parse(text);
  } catch {
    parsed = null;
  }
  return parsed;
}
|
||
|
||
/**
 * Canonicalises text for comparison: trims, collapses whitespace runs to a
 * single space, converts curly quotes to straight quotes and lower-cases.
 *
 * @param {string} [s] - Input text; falsy values are treated as "".
 * @returns {string} The normalised text.
 */
function normalizeText(s) {
  const collapsed = (s || "").trim().replace(/\s+/g, " ");
  const straightQuoted = collapsed.replace(/[“”]/g, '"').replace(/[‘’]/g, "'");
  return straightQuoted.toLowerCase();
}
|
||
|
||
/**
 * Guesses the dominant language of a text by comparing the number of CJK
 * ideographs (U+4E00..U+9FFF) with the number of ASCII letters.
 *
 * @param {string} [text] - Text to classify; falsy values are treated as "".
 * @returns {"zh"|"en"|"mixed"} "zh" when the CJK count exceeds 0.8x the Latin
 *   count, otherwise "en" when the Latin count exceeds 0.8x the CJK count,
 *   otherwise "mixed".
 */
function detectLang(text) {
  const sample = text || "";
  const countMatches = (pattern) => sample.match(pattern)?.length ?? 0;

  const cjk = countMatches(/[\u4e00-\u9fff]/g);
  const latin = countMatches(/[A-Za-z]/g);

  // NOTE(review): both thresholds use the same 0.8 factor, so any text with at
  // least one counted letter resolves to "zh" or "en"; "mixed" is only returned
  // when both counts are zero. Confirm whether a wider "mixed" band was intended.
  if (cjk > latin * 0.8) return "zh";
  return latin > cjk * 0.8 ? "en" : "mixed";
}
|
||
|
||
/**
 * Streams a session JSONL transcript and yields its non-empty user/assistant
 * messages in file order.
 *
 * A line is used only when it parses as JSON, has `type === "message"` and a
 * `message.role` of "user" or "assistant". `message.content` may be a string,
 * or an array whose `{ type: "text", text }` items are joined with newlines.
 * Everything else is skipped silently.
 *
 * @param {string} sessionFile - Path of the JSONL transcript.
 * @yields {{id: number, role: string, timestamp: string, text: string}}
 *   `id` is a 1-based counter over the yielded messages (not the line number);
 *   `timestamp` is the record's `timestamp` or "" when absent.
 */
async function* iterJsonlMessages(sessionFile) {
  const stream = fs.createReadStream(sessionFile, { encoding: "utf-8" });
  const rl = readline.createInterface({ input: stream, crlfDelay: Infinity });

  let id = 0;
  try {
    for await (const line of rl) {
      if (!line) continue;

      const obj = safeJsonParse(line);
      if (!obj || obj.type !== "message") continue;

      const m = obj.message;
      if (!m || (m.role !== "user" && m.role !== "assistant")) continue;

      let text = "";
      if (typeof m.content === "string") {
        text = m.content;
      } else if (Array.isArray(m.content)) {
        text = m.content
          .filter((c) => c && c.type === "text" && c.text)
          .map((c) => c.text)
          .join("\n");
      }
      text = (text || "").trim();
      if (!text) continue;

      id++;
      yield {
        id,
        role: m.role,
        timestamp: obj.timestamp || "",
        text,
      };
    }
  } finally {
    // Runs on normal completion, on errors and when the consumer stops early.
    // Closing the readline interface does not release the underlying file, so
    // the stream is destroyed explicitly to avoid leaking its descriptor.
    rl.close();
    stream.destroy();
  }
}
|
||
|
||
/**
 * Groups the messages of a session transcript into overlapping chunks for the
 * map step.
 *
 * A chunk is closed when adding the next message would push its rendered size
 * (`[U<id>] text\n` / `[A<id>] text\n`) past `maxChars`. The next chunk starts
 * with up to `overlapMsgs` trailing messages of the previous one as context.
 * That carried-over context is trimmed, oldest first, to at most half of
 * `maxChars`: otherwise long messages make the overlap alone exceed the limit,
 * so every chunk is oversized and contributes a single new message.
 * A single message longer than `maxChars` still forms its own oversized chunk.
 *
 * @param {string} sessionFile - Path of the JSONL transcript.
 * @param {object} [options]
 * @param {number} [options.maxChars=12000] - Target maximum rendered size of a chunk.
 * @param {number} [options.overlapMsgs=10] - Maximum number of messages repeated from the previous chunk.
 * @param {number} [options.maxChunks=200] - Reading stops once this many chunks exist.
 * @returns {Promise<{chunks: object[][], lang: string, messageCount: number}>}
 *   `lang` is detected from the first 200 message texts; `messageCount` is the
 *   id of the last message placed in a chunk (0 when no chunk was produced).
 */
async function buildChunksFromJsonl(sessionFile, { maxChars = 12000, overlapMsgs = 10, maxChunks = 200 } = {}) {
  const renderedLength = (m) => `[${m.role === "user" ? "U" : "A"}${m.id}] ${m.text}\n`.length;
  const overlapBudget = maxChars / 2;

  const chunks = [];
  const sampleTexts = [];
  let chunk = [];
  let size = 0;

  for await (const m of iterJsonlMessages(sessionFile)) {
    if (sampleTexts.length < 200) sampleTexts.push(m.text);

    const lineLength = renderedLength(m);

    if (size + lineLength > maxChars && chunk.length > 0) {
      chunks.push(chunk);
      if (chunks.length >= maxChunks) break;

      // Start the next chunk with the tail of the previous one...
      chunk = chunk.slice(Math.max(0, chunk.length - overlapMsgs));
      size = chunk.reduce((acc, mm) => acc + renderedLength(mm), 0);
      // ...but keep that context within budget so there is room for new messages.
      while (chunk.length > 0 && size > overlapBudget) {
        size -= renderedLength(chunk[0]);
        chunk.shift();
      }
    }

    chunk.push(m);
    size += lineLength;
  }

  if (chunk.length > 0 && chunks.length < maxChunks) chunks.push(chunk);

  const lang = detectLang(sampleTexts.join("\n"));

  // Guarded lookup: no chunk may have been emitted (empty transcript, maxChunks <= 0).
  const lastChunk = chunks[chunks.length - 1];
  const messageCount = lastChunk ? lastChunk[lastChunk.length - 1].id : 0;

  return { chunks, lang, messageCount };
}
|
||
|
||
/**
 * Builds the map-step prompt for one transcript chunk.
 *
 * @param {object} params
 * @param {string} params.lang - "zh" or "en" select a fixed output-language
 *   instruction; any other value asks the model to follow the transcript.
 * @param {Array<{id: number, role: string, text: string}>} params.chunk -
 *   Messages rendered as `[U<id>] text` / `[A<id>] text`, separated by blank lines.
 * @returns {string} The complete prompt: rules, language instruction,
 *   transcript and the expected JSON schema.
 */
function buildMapPrompt({ lang, chunk }) {
  let langInstr;
  switch (lang) {
    case "zh":
      langInstr = "请用中文输出 lessons。";
      break;
    case "en":
      langInstr = "Output lessons in English.";
      break;
    default:
      langInstr = "Follow the dominant language of the transcript.";
  }

  const transcript = chunk
    .map((m) => {
      const speaker = m.role === "user" ? "U" : "A";
      return `[${speaker}${m.id}] ${m.text}`;
    })
    .join("\n\n");

  const rules = [
    "You are extracting high-signal technical lessons from a chat transcript chunk.",
    "",
    "Rules:",
    "- Output STRICT JSON only. No markdown, no backticks.",
    '- If nothing valuable, output: {"lessons":[]}',
    "- Max 8 lessons.",
    "- Each lesson.text must be <= 480 characters.",
    "- Categories: fact | decision | preference | other (use fact/decision primarily).",
    "- importance: number 0..1 (high-signal: 0.8-0.95).",
    "- evidence MUST quote exact short snippets from the chunk and include message_ids.",
    "- Do NOT include secrets/tokens/credentials.",
    "- Add Keywords (zh) inside each lesson:",
    " - Include >=1 Entity keyword that appears verbatim in the chunk (project/library/tool/service/config key/error code).",
    " - Include >=1 Action keyword (e.g., 修复/回滚/重启/迁移/去重/限流).",
    " - Include >=1 Symptom keyword (e.g., OOM/超时/429/重复/命中率差).",
    " - Do NOT invent entity names; copy entity keywords from the chunk.",
  ];

  const schema = [
    "Return JSON schema:",
    "{",
    ' "lessons": [',
    " {",
    ' "category": "fact",',
    ' "importance": 0.8,',
    ' "text": "Pitfall: ... Cause: ... Fix: ... Prevention: ...",',
    ' "evidence": [',
    ' {"message_ids":[12,13],"quote":"..."}',
    " ],",
    ' "tags": ["optional"]',
    " }",
    " ]",
    "}",
  ];

  // Sections are separated by a blank line; every piece is joined with "\n".
  return [...rules, "", langInstr, "", "Chunk:", transcript, "", ...schema].join("\n");
}
|
||
|
||
/**
 * Sends one prompt to the Gemini `generateContent` endpoint and returns the
 * text of the first candidate.
 *
 * The API key travels in the `x-goog-api-key` header rather than the query
 * string, so it cannot end up in URLs that get logged or echoed in errors.
 * JSON output mode is requested because callers `JSON.parse` the reply.
 *
 * @param {string} prompt - Full prompt text, sent as a single user turn.
 * @returns {Promise<string>} Concatenated text parts of the first candidate,
 *   or "" when the response contains none.
 * @throws {Error} When GEMINI_API_KEY is unset, when the HTTP status is not
 *   2xx (message includes the status and up to 500 characters of the body), or
 *   when a 2xx response body is not JSON.
 */
async function geminiGenerateJson(prompt) {
  if (!GEMINI_API_KEY) throw new Error("GEMINI_API_KEY is not set");

  const url = `https://generativelanguage.googleapis.com/v1beta/models/${GEMINI_MODEL}:generateContent`;

  const body = {
    contents: [{ role: "user", parts: [{ text: prompt }] }],
    generationConfig: {
      temperature: 0.2,
      maxOutputTokens: 4096,
      responseMimeType: "application/json",
    },
  };

  const res = await fetch(url, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      "x-goog-api-key": GEMINI_API_KEY,
    },
    body: JSON.stringify(body),
  });

  // Read the body as text first: gateways may answer errors with HTML or an
  // empty body, and a JSON parse failure must not hide the HTTP status.
  const raw = await res.text();
  let json = null;
  try {
    json = JSON.parse(raw);
  } catch {
    json = null;
  }

  if (!res.ok) {
    const detail = json === null ? raw : JSON.stringify(json);
    throw new Error(`Gemini error ${res.status}: ${detail.slice(0, 500)}`);
  }
  if (json === null) {
    throw new Error(`Gemini returned a non-JSON response body (status ${res.status})`);
  }

  const parts = json?.candidates?.[0]?.content?.parts;
  if (!Array.isArray(parts)) return "";
  // Parts without a string `text` contribute nothing.
  return parts.map((p) => (typeof p?.text === "string" ? p.text : "")).join("");
}
|
||
|
||
/**
 * Converts a parsed model response into well-formed lesson records.
 *
 * Entries are kept only when `text` is a string of at least 10 characters
 * after trimming. Unknown categories become "other", a non-numeric importance
 * becomes 0.7, text is trimmed and cut to 480 characters, and non-array
 * `evidence` / `tags` become empty arrays.
 *
 * @param {*} obj - Parsed response, expected to look like `{ lessons: [...] }`.
 * @returns {Array<{category: string, importance: number, text: string, evidence: Array, tags: Array}>}
 *   The sanitised lessons; empty when `obj.lessons` is not an array.
 */
function coerceLessons(obj) {
  const allowedCategories = ["fact", "decision", "preference", "other"];
  const rawLessons = Array.isArray(obj?.lessons) ? obj.lessons : [];

  const cleaned = [];
  for (const entry of rawLessons) {
    const usable = entry && typeof entry.text === "string" && entry.text.trim().length >= 10;
    if (!usable) continue;

    cleaned.push({
      category: allowedCategories.includes(entry.category) ? entry.category : "other",
      importance: typeof entry.importance === "number" ? entry.importance : 0.7,
      text: entry.text.trim().slice(0, 480),
      evidence: Array.isArray(entry.evidence) ? entry.evidence : [],
      tags: Array.isArray(entry.tags) ? entry.tags : [],
    });
  }
  return cleaned;
}
|
||
|
||
/**
 * Heuristic ranking score for a lesson; higher is better.
 *
 * Bonuses: +2 for a "Pitfall:/Cause:/Fix:/Prevention:" marker, +2 for a
 * "Decision principle" / "Trigger:" / "Action:" marker, +1 when a known tool
 * keyword appears as a whole word, +0.5 for text shorter than 120 characters,
 * +1 for at least one evidence entry and a further +0.5 for two or more.
 * The importance (default 0.7, clamped to 0..1) is added last.
 *
 * @param {{text?: string, evidence?: Array, importance?: number}} l - Lesson to score.
 * @returns {number} The score.
 */
function scoreLesson(l) {
  const text = l.text || "";
  const evidenceCount = l.evidence?.length;

  // [applies, points] pairs, summed in the listed order.
  const bonuses = [
    [/pitfall\s*:|cause\s*:|fix\s*:|prevention\s*:/i.test(text), 2],
    [/decision principle|trigger\s*:|action\s*:/i.test(text), 2],
    [/\b(openclaw|docker|systemd|ssh|git|api|json|yaml|config)\b/i.test(text), 1],
    [text.length < 120, 0.5],
    [evidenceCount >= 1, 1],
    [evidenceCount >= 2, 0.5],
  ];

  let total = 0;
  for (const [applies, points] of bonuses) {
    if (applies) total += points;
  }

  const importance = Math.max(0, Math.min(1, l.importance ?? 0.7));
  return total + importance;
}
|
||
|
||
/**
 * Reduce step: filters, ranks and de-duplicates candidate lessons.
 *
 * A lesson is dropped when it has no evidence, when its normalised text is
 * shorter than 20 characters, or when it reads like generic advice ("should",
 * "best practice", ...) without naming a cause, fix, trigger or similar.
 * Survivors are ordered by `scoreLesson` (highest first, ties keep input
 * order) and then de-duplicated on normalised text.
 *
 * De-duplication happens after filtering and ranking on purpose: otherwise the
 * first-seen copy of a duplicated lesson wins even when it lacks evidence, and
 * the later copy that has evidence is discarded along with it.
 *
 * @param {Array<{text: string, evidence?: Array, importance?: number}>} allLessons - Candidates from all chunks.
 * @param {number} [maxFinal=20] - Maximum number of lessons to return; negative values are treated as 0.
 * @returns {Array<object>} The selected lessons (same object references as the input).
 */
function reduceLessons(allLessons, maxFinal = 20) {
  const genericAdvice = /(be careful|best practice|should|建议|注意)/;
  const concreteDetail = /(cause|fix|prevention|trigger|action|原因|修复|预防|触发)/;

  const candidates = [];
  for (const lesson of allLessons) {
    const key = normalizeText(lesson.text);
    if (!key || key.length < 20) continue;
    if (!lesson.evidence || lesson.evidence.length === 0) continue;
    if (genericAdvice.test(key) && !concreteDetail.test(key)) continue;

    // Score once per lesson instead of on every sort comparison.
    candidates.push({ lesson, key, score: scoreLesson(lesson) });
  }

  candidates.sort((a, b) => b.score - a.score);

  const seen = new Set();
  const unique = [];
  for (const { lesson, key } of candidates) {
    if (seen.has(key)) continue;
    seen.add(key);
    unique.push(lesson);
  }

  return unique.slice(0, Math.max(0, maxFinal));
}
|
||
|
||
/**
 * Imports lessons into LanceDB Pro by writing them to a temporary JSON file
 * and running `openclaw memory-pro import <file> --scope <scope>`.
 *
 * The payload lives in a freshly created private directory under the system
 * temp dir (instead of a predictable name in a hard-coded `/tmp`), and that
 * directory is removed even when the import command throws.
 *
 * @param {object} params
 * @param {Array<{text: string, importance?: number, category: string}>} params.lessons -
 *   Lessons to import; importance defaults to 0.7 and is clamped to 0..1.
 * @param {string} params.scope - Value passed to `--scope`.
 * @returns {Promise<{code: number, out: string, err: string}>} Result of the import command.
 */
async function importToLanceDb({ lessons, scope }) {
  const payload = {
    memories: lessons.map((l) => ({
      text: l.text,
      importance: Math.max(0.0, Math.min(1.0, l.importance ?? 0.7)),
      category: l.category,
    })),
  };

  // mkdtemp yields a unique directory, so concurrent imports cannot collide.
  const workDir = await fsp.mkdtemp(path.join(os.tmpdir(), "lesson-import-"));
  const importFile = path.join(workDir, "lessons.json");

  try {
    await fsp.writeFile(importFile, JSON.stringify(payload), "utf-8");
    const { code, out, err } = await run("openclaw", ["memory-pro", "import", importFile, "--scope", scope], {
      cwd: REPO_ROOT,
    });
    return { code, out, err };
  } finally {
    // Best-effort cleanup; a failure to delete must not mask the import result.
    await fsp.rm(workDir, { recursive: true, force: true }).catch(() => {});
  }
}
|
||
|
||
/**
 * Sends a Telegram message through `openclaw message send`, run from REPO_ROOT.
 * The command's result is not inspected.
 *
 * @param {string|number} chatId - Target chat; converted to a string for the CLI.
 * @param {string} message - Message text.
 * @returns {Promise<void>}
 */
async function notifyTelegram(chatId, message) {
  const target = String(chatId);
  await run(
    "openclaw",
    ["message", "send", "--channel", "telegram", "--target", target, "--message", message],
    { cwd: REPO_ROOT },
  );
}
|
||
|
||
/**
 * Processes one lesson-extraction task file end to end.
 *
 * Steps: claim the file by moving it into the processing directory, parse it,
 * chunk the referenced session transcript, run the map step (one Gemini call
 * per chunk), reduce the candidates, import them, optionally notify Telegram,
 * and finally write a result JSON to the done or error directory.
 *
 * Differences that matter to operators:
 * - The file is claimed (renamed) before it is read, so duplicate watcher
 *   events cannot process it twice; a file that is already gone is ignored.
 * - A task file that cannot be read or parsed is recorded in the error
 *   directory instead of throwing and being left behind in the inbox.
 * - A failed import, or a run in which every map request failed, is reported
 *   as an error rather than as a success.
 * - Telegram notifications are best effort and never change the outcome.
 *
 * @param {string} taskPath - Path of the task JSON file inside the inbox.
 * @returns {Promise<void>} Resolves once the result file has been written.
 * @throws {Error} Only for unexpected filesystem failures (claiming the file
 *   for a reason other than ENOENT, or writing the error result).
 */
async function processTaskFile(taskPath) {
  const started = Date.now();
  const baseName = path.basename(taskPath);
  const processingPath = path.join(PROCESSING, baseName);

  // Claim the task atomically; ENOENT means it was already claimed or removed.
  try {
    await fsp.rename(taskPath, processingPath);
  } catch (err) {
    if (err?.code === "ENOENT") return;
    throw err;
  }

  // Load the task. Failures are remembered and reported through the normal
  // error path below so the bad file is moved out of the queue.
  let task = {};
  let taskLoadError = null;
  try {
    const parsed = JSON.parse(await fsp.readFile(processingPath, "utf-8"));
    if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) {
      throw new Error("expected a JSON object");
    }
    task = parsed;
  } catch (err) {
    taskLoadError = err;
  }

  // Result files are named after the task id; fall back to the task file name
  // so a missing id does not produce "undefined.json".
  const hasTaskId = task.taskId !== undefined && task.taskId !== null && String(task.taskId) !== "";
  const resultFileName = `${hasTaskId ? path.basename(String(task.taskId)) : path.basename(baseName, ".json")}.json`;
  const shortTaskId = (hasTaskId ? String(task.taskId).slice(0, 8) : "") || "unknown";
  const agentLabel = task.agentId || "main";
  const notifyChatId = task.notify?.telegramChatId;

  const notify = async (text) => {
    if (!notifyChatId) return;
    try {
      await notifyTelegram(notifyChatId, text);
    } catch (err) {
      console.error(
        `[lesson-extract-worker] telegram notification failed: ${err instanceof Error ? err.message : String(err)}`,
      );
    }
  };

  const result = {
    taskId: task.taskId,
    startedAt: nowIso(),
    finishedAt: null,
    ok: false,
    sessionId: task.session?.sessionId,
    sessionFile: task.session?.sessionFile,
    stats: {},
    error: null,
  };

  try {
    if (taskLoadError) {
      const reason = taskLoadError instanceof Error ? taskLoadError.message : String(taskLoadError);
      throw new Error(`invalid task file ${baseName}: ${reason}`);
    }

    const sessionFile = task.session?.sessionFile;
    if (!sessionFile || !fs.existsSync(sessionFile)) {
      throw new Error(`sessionFile missing or not found: ${sessionFile}`);
    }

    const { chunks, lang, messageCount } = await buildChunksFromJsonl(sessionFile, {
      maxChars: task.extract?.mapChunkChars ?? 12000,
      overlapMsgs: task.extract?.mapOverlapMsgs ?? 10,
      maxChunks: 200,
    });

    // Map: one model call per chunk; individual failures are counted, not fatal.
    const allLessons = [];
    let mapErrors = 0;
    let lastMapError = null;

    for (const chunk of chunks) {
      const prompt = buildMapPrompt({ lang, chunk });
      try {
        const obj = safeJsonParse(await geminiGenerateJson(prompt));
        if (!obj) {
          mapErrors++;
          lastMapError = "model response was not valid JSON";
          continue;
        }
        allLessons.push(...coerceLessons(obj));
      } catch (err) {
        mapErrors++;
        lastMapError = err instanceof Error ? err.message : String(err);
      }
    }

    // If nothing could be extracted at all (e.g. missing API key), importing an
    // empty set and reporting success would hide the outage.
    if (chunks.length > 0 && mapErrors === chunks.length) {
      result.stats = {
        lang,
        messages: messageCount,
        chunks: chunks.length,
        mapCandidates: allLessons.length,
        mapErrors,
      };
      throw new Error(`all ${chunks.length} map requests failed; last error: ${lastMapError}`);
    }

    const reduced = reduceLessons(allLessons, task.extract?.maxFinal ?? 20);

    const scope = task.scope || "agent:main";
    const importRes = await importToLanceDb({ lessons: reduced, scope });

    const durationMs = Date.now() - started;
    result.stats = {
      lang,
      messages: messageCount,
      chunks: chunks.length,
      mapCandidates: allLessons.length,
      mapErrors,
      reduced: reduced.length,
      importCode: importRes.code,
      durationMs,
    };

    if (importRes.code !== 0) {
      const detail = String(importRes.err || importRes.out || "").trim().slice(0, 300);
      throw new Error(`memory-pro import failed (code ${importRes.code})${detail ? `: ${detail}` : ""}`);
    }

    result.ok = true;
    result.finishedAt = nowIso();

    await notify(
      [
        `Lesson Extract ✅ (${agentLabel})`,
        `taskId: ${shortTaskId}`,
        `sessionId: ${task.session?.sessionId || "unknown"}`,
        `lang: ${lang}`,
        `messages: ${messageCount}, chunks: ${chunks.length}`,
        `candidates: ${allLessons.length}, reduced: ${reduced.length}`,
        `import: code=${importRes.code}`,
        `time: ${(durationMs / 1000).toFixed(1)}s`,
      ].join("\n"),
    );

    const donePath = path.join(DONE, resultFileName);
    await fsp.writeFile(donePath, JSON.stringify(result, null, 2) + "\n", "utf-8");
    await fsp.unlink(processingPath).catch(() => {});
  } catch (err) {
    result.ok = false;
    result.finishedAt = nowIso();
    result.error = err instanceof Error ? err.message : String(err);

    const durationMs = Date.now() - started;
    result.stats.durationMs = durationMs;

    await notify(
      `Lesson Extract ❌ (${agentLabel})\n` +
        `taskId: ${shortTaskId}\n` +
        `error: ${result.error}\n` +
        `time: ${(durationMs / 1000).toFixed(1)}s`,
    );

    const errPath = path.join(ERROR, resultFileName);
    await fsp.writeFile(errPath, JSON.stringify(result, null, 2) + "\n", "utf-8");
    await fsp.unlink(processingPath).catch(() => {});
  }
}
|
||
|
||
/**
 * Processes every `*.json` task currently in the inbox, one at a time, in
 * file-name order.
 *
 * A task that throws no longer stops the drain: the failure is logged, the
 * remaining tasks are still processed, and the failures are reported together
 * at the end.
 *
 * @returns {Promise<void>}
 * @throws {AggregateError} After all tasks were attempted, if any of them threw.
 */
async function drainInboxOnce() {
  ensureDirs();

  const entries = await fsp.readdir(INBOX);
  const files = entries.filter((f) => f.endsWith(".json")).sort();

  const failures = [];
  for (const f of files) {
    try {
      await processTaskFile(path.join(INBOX, f));
    } catch (err) {
      console.error(
        `[lesson-extract-worker] failed to process ${f}: ${err instanceof Error ? err.message : String(err)}`,
      );
      failures.push({ file: f, error: err });
    }
  }

  if (failures.length > 0) {
    throw new AggregateError(
      failures.map((failure) => failure.error),
      `${failures.length} of ${files.length} task(s) failed: ${failures.map((failure) => failure.file).join(", ")}`,
    );
  }
}
|
||
|
||
/**
 * Worker entry point.
 *
 * Drains the inbox once, then (unless `--once` was given) watches the inbox
 * and processes new `*.json` task files until SIGINT or SIGTERM arrives.
 *
 * Watch-mode details:
 * - File-system watchers usually emit several events per file; a file that is
 *   already scheduled or being processed is not scheduled again.
 * - Processing starts 150 ms after the first event for a file.
 * - Processing failures are logged instead of being discarded silently.
 * - On a signal the watcher is closed and this function returns; work that is
 *   already in flight is left to finish.
 *
 * @returns {Promise<void>}
 */
async function main() {
  ensureDirs();
  await drainInboxOnce();
  if (ONCE) return;

  // File names that are scheduled or currently being processed.
  const pending = new Set();

  const watcher = fs.watch(INBOX, (_eventType, filename) => {
    if (!filename || !filename.endsWith(".json")) return;
    if (pending.has(filename)) return;
    pending.add(filename);

    const full = path.join(INBOX, filename);
    setTimeout(() => {
      // The event may describe a removal (e.g. the task being moved away).
      if (!fs.existsSync(full)) {
        pending.delete(filename);
        return;
      }
      processTaskFile(full)
        .catch((err) => {
          if (err?.code === "ENOENT") return; // file vanished in the meantime
          console.error(
            `[lesson-extract-worker] failed to process ${filename}: ${err instanceof Error ? err.message : String(err)}`,
          );
        })
        .finally(() => {
          pending.delete(filename);
        });
    }, 150);
  });

  // The open watcher keeps the process alive; wait here until asked to stop.
  await new Promise((resolve) => {
    const shutdown = () => {
      watcher.close();
      resolve();
    };
    process.once("SIGINT", shutdown);
    process.once("SIGTERM", shutdown);
  });
}
|
||
|
||
// Start the worker; an unhandled failure is printed and ends the process with exit code 1.
main().catch(function onFatal(err) {
  const description = String(err);
  console.error(description);
  process.exit(1);
});
|