feat: import DOCX/PDF/scanned templates via DeepSeek recognition

Backend pipeline: - POST /api/templates/import (multipart, max 20 MB) - extract.ts: DOCX→mammoth, PDF→pdf-parse, fallback to OCR via tesseract+poppler-utils (pdftoppm renders pages to PNG, tesseract reads with rus+eng) - deepseek.ts: chat completions client with strict JSON response_format - recognize.ts: structured prompt that produces simplified DocBody (string text), postprocessor wraps text in TipTap-compatible JSON, validates with zod schema - prompt enforces placeholder substitution: {{customer.*}}, {{executor.*}}, {{contract.number}}, {{contract.date}}, {{today}} - error codes: NO_OCR / NO_DEEPSEEK_KEY / UNSUPPORTED_MIME / INVALID_DOC_BODY Dockerfile: apk add tesseract-ocr (+rus +eng data), poppler-utils, imagemagick Frontend: - Templates page: ⤴ Загрузить документ → file picker (.docx,.pdf,.png,.jpg) - doc type selector (contract/invoice/act/upd) - import-banner with spinner shows uploading→analyzing stages - on success navigates to /templates/:id (TemplateEdit) for review Reuses DEEPSEEK_API_KEY pattern from Hall-planer. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 11:40:28 +03:00
parent 624d378bb5
commit e768d30fb6
13 changed files with 1114 additions and 7 deletions
@@ -0,0 +1,48 @@
+import { env } from '../../env.js';
+
+/** A single entry of the `messages` array sent in the chat-completions request body. */
+type ChatMessage = { role: 'system' | 'user' | 'assistant'; content: string };
+
+/** The subset of the chat-completions response body that this module reads. */
+type DeepSeekResponse = {
+  choices: { message: { content: string } }[];
+};
+
+/**
+ * Sends `messages` to the DeepSeek chat-completions endpoint with
+ * `response_format: { type: 'json_object' }` and returns the first choice's
+ * message content parsed as JSON.
+ *
+ * The parsed value is only cast to `T`, never validated — callers must check its shape.
+ *
+ * @param messages Conversation to send, in order.
+ * @param opts Optional overrides; defaults are temperature 0.2 and 8000 max tokens.
+ * @returns The parsed JSON content of the first choice.
+ * @throws Error carrying a `code` property:
+ *   - `NO_DEEPSEEK_KEY` — `env.DEEPSEEK_API_KEY` is not set;
+ *   - `DEEPSEEK_HTTP_ERROR` — non-2xx response (`status` holds the HTTP status);
+ *   - `DEEPSEEK_EMPTY_RESPONSE` — the response has no message content;
+ *   - `DEEPSEEK_BAD_JSON` — the response envelope or the message content is not
+ *     valid JSON (`raw` holds the content when it was available).
+ *   Network failures and the 120 s timeout reject from `fetch` unchanged.
+ */
+export async function deepseekJsonChat<T>(messages: ChatMessage[], opts: { temperature?: number; maxTokens?: number } = {}): Promise<T> {
+  if (!env.DEEPSEEK_API_KEY) {
+    throw Object.assign(new Error('DeepSeek API ключ не настроен (DEEPSEEK_API_KEY).'), {
+      code: 'NO_DEEPSEEK_KEY',
+    });
+  }
+  const res = await fetch(`${env.DEEPSEEK_BASE_URL}/chat/completions`, {
+    method: 'POST',
+    headers: {
+      Authorization: `Bearer ${env.DEEPSEEK_API_KEY}`,
+      'Content-Type': 'application/json',
+    },
+    body: JSON.stringify({
+      model: env.DEEPSEEK_MODEL,
+      messages,
+      response_format: { type: 'json_object' },
+      temperature: opts.temperature ?? 0.2,
+      max_tokens: opts.maxTokens ?? 8000,
+    }),
+    signal: AbortSignal.timeout(120_000),
+  });
+  if (!res.ok) {
+    // Best effort: include the start of the error body, but never fail while reading it.
+    const body = await res.text().catch(() => '');
+    throw Object.assign(new Error(`DeepSeek ${res.status}: ${body.slice(0, 300)}`), {
+      code: 'DEEPSEEK_HTTP_ERROR',
+      status: res.status,
+    });
+  }
+  // A 2xx response with a non-JSON body must surface as a coded error, not a bare SyntaxError.
+  let data: DeepSeekResponse | null;
+  try {
+    data = (await res.json()) as DeepSeekResponse | null;
+  } catch {
+    throw Object.assign(new Error('DeepSeek вернул не валидный JSON'), {
+      code: 'DEEPSEEK_BAD_JSON',
+    });
+  }
+  const content = data?.choices?.[0]?.message?.content;
+  if (!content) {
+    // Coded like every other failure here so callers can map it to a specific response.
+    throw Object.assign(new Error('DeepSeek: пустой ответ'), {
+      code: 'DEEPSEEK_EMPTY_RESPONSE',
+    });
+  }
+  try {
+    return JSON.parse(content) as T;
+  } catch {
+    throw Object.assign(new Error('DeepSeek вернул не валидный JSON'), {
+      code: 'DEEPSEEK_BAD_JSON',
+      raw: content,
+    });
+  }
+}
@@ -0,0 +1,114 @@
+import { execFile } from 'node:child_process';
+import { mkdtemp, readFile, readdir, rm, writeFile } from 'node:fs/promises';
+import { tmpdir } from 'node:os';
+import { join } from 'node:path';
+import { promisify } from 'node:util';
+import mammoth from 'mammoth';
+import * as pdfParseNs from 'pdf-parse';
+
+// Under ESM, pdf-parse exposes either a `default` export or the namespace itself — normalize to one callable.
+const pdfParse: (buf: Buffer) => Promise<{ text: string; numpages: number }> =
+  // @ts-expect-error namespace shape varies between cjs/esm
+  (pdfParseNs.default ?? pdfParseNs) as never;
+
+// Promise-returning wrapper around child_process.execFile.
+const execFileP = promisify(execFile);
+
+/** Which extraction path produced the text. */
+export type ExtractedSource = 'docx' | 'pdf-text' | 'pdf-ocr' | 'image-ocr';
+/** Extraction result: the text, the path that produced it, and the page count when one is reported. */
+export type Extracted = { text: string; source: ExtractedSource; pageCount?: number };
+
+// Language list passed to tesseract's `-l` option.
+const OCR_LANGS = 'rus+eng';
+
+/**
+ * Extracts readable text from an uploaded document, dispatching on MIME type and file name:
+ * - DOCX → mammoth raw-text extraction (no OCR);
+ * - PDF → delegated to `extractFromPdf`;
+ * - images (by `image/*` MIME or png/jpg/jpeg/tif/tiff/bmp extension) → OCR via `ocrImageBuffer`.
+ *
+ * @param buf Raw file contents.
+ * @param mime MIME type reported for the upload (may be empty).
+ * @param filename Original file name, used as a fallback for type detection.
+ * @returns The cleaned-up text together with the path that produced it.
+ * @throws Error with `code: 'UNSUPPORTED_MIME'` when no branch matches; errors from the
+ *   delegated extractors propagate unchanged.
+ */
+export async function extractText(buf: Buffer, mime: string, filename: string): Promise<Extracted> {
+  // The file name comes last, so an `endsWith` check on this string tests the file extension.
+  const haystack = `${mime} ${filename}`.toLowerCase();
+
+  const isDocx = haystack.includes('wordprocessingml') || haystack.endsWith('.docx');
+  if (isDocx) {
+    const { value } = await mammoth.extractRawText({ buffer: buf });
+    return { text: cleanup(value), source: 'docx' };
+  }
+
+  const isPdf = mime.includes('pdf') || haystack.endsWith('.pdf');
+  if (isPdf) {
+    return extractFromPdf(buf);
+  }
+
+  const isImage = mime.startsWith('image/') || /\.(png|jpe?g|tiff?|bmp)$/i.test(filename);
+  if (!isImage) {
+    throw Object.assign(new Error(`Неподдерживаемый формат: ${mime || filename}`), {
+      code: 'UNSUPPORTED_MIME',
+    });
+  }
+
+  // NOTE(review): every image is handed over with a '.png' suffix regardless of its real
+  // format — presumably tesseract detects the format from the content; confirm.
+  const recognized = await ocrImageBuffer(buf, '.png');
+  return { text: cleanup(recognized), source: 'image-ocr' };
+}
+
+/**
+ * Extracts text from a PDF, preferring its embedded text layer and falling back to OCR.
+ *
+ * The text layer is used only when it has more than 200 characters in total and more than
+ * 60 characters per page on average; otherwise the PDF is treated as a scan and OCR'd.
+ *
+ * @param buf Raw PDF contents.
+ * @returns Cleaned-up text, the path that produced it, and the page count reported by pdf-parse.
+ */
+async function extractFromPdf(buf: Buffer): Promise<Extracted> {
+  const { text: rawLayer, numpages } = await pdfParse(buf);
+  const layerText = cleanup(rawLayer);
+
+  // With no pages reported, fall back to the total length as the "per page" density.
+  const density = numpages > 0 ? layerText.length / numpages : layerText.length;
+  const looksScanned = layerText.length <= 200 || density <= 60;
+
+  if (looksScanned) {
+    const recognized = await ocrPdfBuffer(buf);
+    return { text: cleanup(recognized), source: 'pdf-ocr', pageCount: numpages };
+  }
+  return { text: layerText, source: 'pdf-text', pageCount: numpages };
+}
+
+/**
+ * Runs tesseract over an in-memory image and returns what it prints to stdout.
+ *
+ * The image is written to a fresh temporary directory, which is removed afterwards
+ * whether or not recognition succeeded.
+ *
+ * @param buf Image contents.
+ * @param ext Suffix (including the dot) for the temporary input file name.
+ * @returns Raw tesseract output.
+ * @throws Error with `code: 'NO_OCR'` when the failure carries `code === 'ENOENT'`
+ *   (reported as tesseract not being installed); any other error is rethrown as is.
+ */
+async function ocrImageBuffer(buf: Buffer, ext: string): Promise<string> {
+  const workDir = await mkdtemp(join(tmpdir(), 'docmgr-ocr-'));
+  const imagePath = join(workDir, `in${ext}`);
+  try {
+    await writeFile(imagePath, buf);
+    const result = await execFileP('tesseract', [imagePath, 'stdout', '-l', OCR_LANGS], {
+      maxBuffer: 32 * 1024 * 1024,
+    });
+    return result.stdout;
+  } catch (err) {
+    const missingBinary = (err as { code?: string }).code === 'ENOENT';
+    if (!missingBinary) {
+      throw err;
+    }
+    throw Object.assign(new Error('Tesseract не установлен в окружении. PDF-сканы обработать не получится.'), {
+      code: 'NO_OCR',
+    });
+  } finally {
+    await rm(workDir, { recursive: true, force: true });
+  }
+}
+
+/**
+ * OCRs a PDF: renders its pages to PNG files with pdftoppm, then feeds each page image
+ * to `ocrImageBuffer` one after another and joins the page texts with a blank line.
+ *
+ * All intermediate files live in a temporary directory that is always removed at the end.
+ *
+ * @param buf Raw PDF contents.
+ * @returns Concatenated OCR output of all rendered pages.
+ * @throws Error with `code: 'NO_OCR'` when launching pdftoppm fails with `ENOENT`;
+ *   other errors (including those from `ocrImageBuffer`) propagate unchanged.
+ */
+async function ocrPdfBuffer(buf: Buffer): Promise<string> {
+  const workDir = await mkdtemp(join(tmpdir(), 'docmgr-pdf-'));
+  try {
+    const pdfPath = join(workDir, 'in.pdf');
+    const outPrefix = join(workDir, 'out');
+    await writeFile(pdfPath, buf);
+
+    // pdftoppm in.pdf out -png -r 200 → out-1.png, out-2.png ...
+    try {
+      await execFileP('pdftoppm', [pdfPath, outPrefix, '-png', '-r', '200']);
+    } catch (err) {
+      const missingBinary = (err as { code?: string }).code === 'ENOENT';
+      if (!missingBinary) {
+        throw err;
+      }
+      throw Object.assign(new Error('pdftoppm (poppler-utils) не установлен.'), { code: 'NO_OCR' });
+    }
+
+    const entries = await readdir(workDir);
+    const pageImages = entries.filter((name) => name.startsWith('out-') && name.endsWith('.png')).sort();
+
+    // Pages are recognized sequentially, in file-name order.
+    const pageTexts: string[] = [];
+    for (const name of pageImages) {
+      const png = await readFile(join(workDir, name));
+      const recognized = await ocrImageBuffer(png, '.png');
+      pageTexts.push(recognized);
+    }
+    return pageTexts.join('\n\n');
+  } finally {
+    await rm(workDir, { recursive: true, force: true });
+  }
+}
+
+/**
+ * Normalizes extracted text: removes soft hyphens (U+00AD), strips spaces/tabs that
+ * precede a line break, collapses runs of three or more newlines into one blank line,
+ * and trims the result.
+ *
+ * @param s Raw text from an extractor.
+ * @returns The normalized text.
+ */
+function cleanup(s: string): string {
+  return s
+    // Written as an escape: a literal soft hyphen is invisible and easily lost in copy/paste,
+    // which would turn the pattern into `//g` — a line comment that breaks the call chain.
+    .replace(/\u00AD/g, '')
+    .replace(/[ \t]+\n/g, '\n')
+    .replace(/\n{3,}/g, '\n\n')
+    .trim();
+}
@@ -0,0 +1,95 @@
+import type { FastifyInstance } from 'fastify';
+import { Prisma } from '@prisma/client';
+import { prisma } from '../../db.js';
+import { getOrganizationId } from '../../lib/org.js';
+import { extractText } from './extract.js';
+import { recognizeTemplate } from './recognize.js';
+
+const MAX_BYTES = 20 * 1024 * 1024; // 20 MB
+
+// Document types accepted from the optional `docType` form field.
+const IMPORT_DOC_TYPES = ['contract', 'invoice', 'act', 'upd'] as const;
+type ImportDocType = (typeof IMPORT_DOC_TYPES)[number];
+
+/** Returns `value` (trimmed) when it names a supported document type, otherwise `null`. */
+function parseImportDocType(value: string | null): ImportDocType | null {
+  const candidate = value?.trim();
+  return IMPORT_DOC_TYPES.find((t) => t === candidate) ?? null;
+}
+
+/**
+ * Registers `POST /api/templates/import`, guarded by `app.requireDocPermission('user')`.
+ *
+ * The handler reads one uploaded file (up to `MAX_BYTES`), extracts its text, has the
+ * LLM turn it into a template body, and stores the result as a `documentTemplate`.
+ *
+ * Responses:
+ * - 201 `{ template, extractedFrom, textLength }` on success;
+ * - 400 `no_file` when the request has no file part;
+ * - 415 `unsupported_mime`, 422 `empty_extracted_text` (fewer than 30 characters extracted);
+ * - 503 `no_deepseek_key` / `no_ocr` when a required dependency is not configured;
+ * - 502 `invalid_doc_body` / `deepseek_error` when the LLM step fails;
+ * - 500 `import_failed` for anything else (logged).
+ */
+export async function templatesImportRoutes(app: FastifyInstance) {
+  app.post(
+    '/api/templates/import',
+    { preHandler: app.requireDocPermission('user') },
+    async (req, reply) => {
+      const orgId = getOrganizationId(req);
+      const file = await req.file({ limits: { fileSize: MAX_BYTES } });
+      if (!file) {
+        reply.code(400).send({ error: 'no_file' });
+        return;
+      }
+      const buf = await file.toBuffer();
+      const filename = file.filename || 'template';
+      const mime = file.mimetype || '';
+
+      // Optional name/type arrive as form fields; when absent, the LLM result decides.
+      const fields = (file.fields ?? {}) as Record<string, { value?: string }>;
+      const userName = typeof fields.name?.value === 'string' ? fields.name.value : null;
+      const userDocType = typeof fields.docType?.value === 'string' ? fields.docType.value : null;
+
+      try {
+        // 1) Extract the text
+        const extracted = await extractText(buf, mime, filename);
+        if (extracted.text.trim().length < 30) {
+          reply.code(422).send({
+            error: 'empty_extracted_text',
+            message: 'Не удалось извлечь читаемый текст. Возможно, скан плохого качества.',
+            source: extracted.source,
+          });
+          return;
+        }
+
+        // 2) Recognize the structure via DeepSeek
+        const recognized = await recognizeTemplate(extracted.text);
+        // The form field is client input: an empty or unknown value must not reach the
+        // database layer, so it is ignored in favour of the recognized type.
+        const docType = parseImportDocType(userDocType) ?? recognized.docType;
+        const name = userName?.trim() || recognized.title || `Импорт ${new Date().toLocaleDateString('ru-RU')}`;
+
+        const created = await prisma.documentTemplate.create({
+          data: {
+            organizationId: orgId,
+            docType,
+            name,
+            body: recognized.docBody as Prisma.InputJsonValue,
+          },
+        });
+
+        reply.code(201).send({
+          template: created,
+          extractedFrom: extracted.source,
+          textLength: extracted.text.length,
+        });
+      } catch (e) {
+        const err = e as { code?: unknown; message?: string; status?: number };
+        if (err.code === 'NO_DEEPSEEK_KEY') {
+          reply.code(503).send({ error: 'no_deepseek_key', message: err.message });
+          return;
+        }
+        if (err.code === 'NO_OCR') {
+          reply.code(503).send({ error: 'no_ocr', message: err.message });
+          return;
+        }
+        if (err.code === 'UNSUPPORTED_MIME') {
+          reply.code(415).send({ error: 'unsupported_mime', message: err.message });
+          return;
+        }
+        if (err.code === 'INVALID_DOC_BODY') {
+          reply.code(502).send({
+            error: 'invalid_doc_body',
+            message: 'LLM вернула структуру, не прошедшую валидацию. Попробуйте загрузить ещё раз или отредактировать вручную.',
+          });
+          return;
+        }
+        if (err.code === 'DEEPSEEK_HTTP_ERROR') {
+          reply.code(502).send({
+            error: 'deepseek_error',
+            message: err.message,
+            status: err.status ?? 502,
+          });
+          return;
+        }
+        if (typeof err.code === 'string' && err.code.startsWith('DEEPSEEK_')) {
+          // Other upstream failures (e.g. DEEPSEEK_BAD_JSON): the service answered, but with
+          // an unusable payload — report a bad gateway rather than an internal error.
+          reply.code(502).send({ error: 'deepseek_error', message: err.message });
+          return;
+        }
+        app.log.error({ err: e }, 'template import failed');
+        reply.code(500).send({ error: 'import_failed', message: err.message ?? 'unknown' });
+      }
+    },
+  );
+}
@@ -0,0 +1,156 @@
+import type { DocType } from '@prisma/client';
+import { DocBody as DocBodySchema } from '@doc-manager/shared';
+import { deepseekJsonChat } from './deepseek.js';
+
+// The LLM returns a simplified structure (plain string text instead of TipTap JSON).
+// We convert it into a full DocBody with RichText on our side.
+
+/** One block as requested from the LLM; text-bearing blocks carry a plain string. */
+type LlmBlock =
+  | { type: 'heading'; level: 1 | 2 | 3; text: string }
+  | { type: 'paragraph'; text: string }
+  | { type: 'party'; role: 'executor' | 'customer'; bind: { kind: 'self' | 'client' } }
+  | { type: 'services_table'; columns?: string[]; lines?: [] }
+  | { type: 'totals'; showVat?: boolean; showInWords?: boolean }
+  | { type: 'terms'; text: string }
+  | { type: 'signatures'; sides: ('executor' | 'customer')[] }
+  | { type: 'custom_text'; text: string }
+  | { type: 'page_break' };
+
+/** Top-level shape of the JSON object requested from the LLM. */
+type LlmResult = {
+  docType?: 'contract' | 'invoice' | 'act' | 'upd';
+  title?: string;
+  blocks: LlmBlock[];
+};
+
+// System prompt for the recognition call. It instructs the model (in Russian) to return the
+// document as a JSON object with `docType`, `title` and a simplified `blocks` list, replacing
+// concrete party details, numbers and dates with {{…}} placeholders.
+const SYSTEM_PROMPT = `Ты — парсер юридических документов на русском языке. Получаешь plain-text договора, счёта, акта или УПД и возвращаешь его структуру строго в JSON формате DocBody.
+
+Цель: создать ШАБЛОН, в котором конкретные данные сторон, номера, даты — заменены на плейсхолдеры. Шаблон потом будет инстансироваться для конкретного клиента и документа.
+
+Правила замены на плейсхолдеры (заменяй ТОЛЬКО эти сущности, остальной текст оставь как есть):
+- Конкретный номер договора/счёта → {{contract.number}}
+- Конкретная дата документа → {{contract.date}}
+- Текущая дата (на момент рендера) → {{today}}
+- Реквизиты исполнителя (наша компания):
+  {{executor.name}}, {{executor.inn}}, {{executor.kpp}}, {{executor.ogrn}}, {{executor.legalAddress}},
+  {{executor.signatoryName}}, {{executor.signatoryPosition}},
+  {{executor.bankName}}, {{executor.bankBik}}, {{executor.bankAccount}}
+- Реквизиты заказчика (клиента):
+  {{customer.name}}, {{customer.inn}}, {{customer.kpp}}, {{customer.address}},
+  {{customer.email}}, {{customer.phone}}, {{customer.contactPerson}}
+
+Структура ответа (JSON-объект):
+{
+  "docType": "contract" | "invoice" | "act" | "upd",
+  "title": "Договор оказания услуг",
+  "blocks": [
+    { "type": "heading", "level": 1, "text": "Договор оказания услуг № {{contract.number}} от {{contract.date}}" },
+    { "type": "paragraph", "text": "..." },
+    { "type": "party", "role": "executor", "bind": { "kind": "self" } },
+    { "type": "party", "role": "customer", "bind": { "kind": "client" } },
+    { "type": "services_table", "columns": ["name","qty","unit","price","vat","sum"], "lines": [] },
+    { "type": "totals", "showVat": true, "showInWords": true },
+    { "type": "terms", "text": "..." },
+    { "type": "signatures", "sides": ["executor","customer"] },
+    { "type": "custom_text", "text": "..." },
+    { "type": "page_break" }
+  ]
+}
+
+Важные правила:
+1. Если в документе встречается реквизитный блок одной из сторон (ИНН/КПП/адрес/банк) — выдай ОДИН блок "party" с правильным role, не добавляй параграф с текстовым перечислением реквизитов.
+2. Услуги/работы в табличной форме — НЕ переписывай построчно, ставь блок "services_table" (lines пустой массив, строки пользователь добавит вручную).
+3. Если есть итоговые суммы (Итого, в т.ч. НДС, сумма прописью) — ставь блок "totals" со включенными нужными опциями.
+4. Подписи в конце — блок "signatures" с указанием каких сторон видно.
+5. Нумерованные пункты «1. Предмет договора», «2. Цена и порядок расчётов» и т.п. — каждый раздел как блок "terms" (заголовок раздела можно положить отдельным "heading" level 2).
+6. Не выдумывай блоки, которых нет в документе.
+7. text внутри блоков — обычная строка (не TipTap JSON), может содержать \\n для новых параграфов внутри блока.
+8. Заполняй "title" коротким названием документа.
+9. Если не уверен в типе документа — ставь "contract".`;
+
+/**
+ * Produces a short pseudo-random identifier: up to nine base-36 characters taken from
+ * the fractional part of `Math.random()`. Not cryptographically strong.
+ */
+function uid(): string {
+  const base36 = Math.random().toString(36); // e.g. "0.k3j9x2m1q7"
+  return base36.substring(2, 11);
+}
+
+/**
+ * Wraps plain text into a `doc` node: every line (split on LF or CRLF) becomes a
+ * `paragraph`, holding a single `text` node — or no content at all for an empty line.
+ *
+ * @param text Plain text, possibly multi-line.
+ * @returns The document node as a plain JSON-compatible object.
+ */
+function plainToRich(text: string): unknown {
+  const paragraphs: { type: string; content: { type: string; text: string }[] }[] = [];
+  for (const line of text.split(/\r?\n/)) {
+    // Empty lines are kept as empty paragraphs rather than paragraphs with an empty text node.
+    const content = line === '' ? [] : [{ type: 'text', text: line }];
+    paragraphs.push({ type: 'paragraph', content });
+  }
+  return { type: 'doc', content: paragraphs };
+}
+
+/**
+ * Converts the simplified structure returned by the LLM into a full document body
+ * (`{ version: 1, blocks, vars: {} }`), giving every block a fresh id and wrapping plain
+ * strings into rich-text nodes via `plainToRich`.
+ *
+ * The input is parsed LLM output and therefore untrusted, so the conversion is defensive:
+ * - a missing or unknown `docType` becomes `'contract'`, a non-string `title` becomes `'Документ'`;
+ * - a non-array `blocks` is treated as empty;
+ * - a missing or non-string block `text` is treated as `''`;
+ * - a block that is not an object or has an unknown `type` becomes a `custom_text` block
+ *   (keeping its `text` when that is a string).
+ *
+ * @param result Parsed LLM response.
+ * @returns The document body (not yet schema-validated), the document type and the title.
+ */
+function llmToDocBody(result: LlmResult): { docBody: unknown; docType: DocType; title: string } {
+  type Block = LlmResult['blocks'][number];
+
+  // The whole response may be `null` or a non-object if the model ignored the requested shape.
+  const source = (typeof result === 'object' && result !== null ? result : {}) as Partial<LlmResult>;
+
+  const knownDocTypes: readonly string[] = ['contract', 'invoice', 'act', 'upd'];
+  const docType = (
+    typeof source.docType === 'string' && knownDocTypes.includes(source.docType) ? source.docType : 'contract'
+  ) as DocType;
+  const title = typeof source.title === 'string' ? source.title : 'Документ';
+
+  const textOf = (value: unknown): string => (typeof value === 'string' ? value : '');
+  const rawBlocks: unknown[] = Array.isArray(source.blocks) ? source.blocks : [];
+
+  const blocks = rawBlocks.map((raw) => {
+    const id = uid();
+    const b = (typeof raw === 'object' && raw !== null ? raw : {}) as Block;
+    switch (b.type) {
+      case 'heading':
+        return { id, type: 'heading', level: clampLevel(b.level), text: plainToRich(textOf(b.text)) };
+      case 'paragraph':
+        return { id, type: 'paragraph', text: plainToRich(textOf(b.text)) };
+      case 'party':
+        return {
+          id,
+          type: 'party',
+          role: b.role,
+          bind: b.bind?.kind === 'self' ? { kind: 'self' as const } : { kind: 'client' as const },
+        };
+      case 'services_table':
+        // Columns and lines are fixed here; whatever the LLM sent for them is ignored.
+        return {
+          id,
+          type: 'services_table',
+          columns: ['name', 'qty', 'unit', 'price', 'vat', 'sum'] as const,
+          lines: [],
+        };
+      case 'totals':
+        return { id, type: 'totals', showVat: b.showVat ?? true, showInWords: b.showInWords ?? true };
+      case 'terms':
+        return { id, type: 'terms', text: plainToRich(textOf(b.text)) };
+      case 'signatures':
+        return {
+          id,
+          type: 'signatures',
+          sides: Array.isArray(b.sides) && b.sides.length ? b.sides : (['executor', 'customer'] as const),
+        };
+      case 'custom_text':
+        return { id, type: 'custom_text', text: plainToRich(textOf(b.text)) };
+      case 'page_break':
+        return { id, type: 'page_break' };
+      default:
+        // Unknown block type: keep any text it carries so nothing is silently lost.
+        return { id, type: 'custom_text', text: plainToRich(textOf((b as { text?: unknown }).text)) };
+    }
+  });
+  const docBody = { version: 1, blocks, vars: {} };
+  return { docBody, docType, title };
+}
+
+/** Returns `l` when it is exactly 2 or 3; every other value (including 1) maps to 1. */
+function clampLevel(l: number): 1 | 2 | 3 {
+  return l === 2 ? 2 : l === 3 ? 3 : 1;
+}
+
+/**
+ * Turns extracted document text into a template body: sends the text to the LLM with
+ * `SYSTEM_PROMPT`, converts the answer with `llmToDocBody`, and validates the result
+ * against the shared `DocBody` schema.
+ *
+ * @param text Plain text of the document; anything beyond 60 000 characters is cut off
+ *   and replaced by a truncation marker before sending.
+ * @returns The validated body, the document type, the title, and the raw LLM answer in `raw`.
+ * @throws Error with `code: 'INVALID_DOC_BODY'` (plus `issues` and `raw`) when validation
+ *   fails; errors from `deepseekJsonChat` propagate unchanged.
+ */
+export async function recognizeTemplate(text: string): Promise<{ docBody: unknown; docType: DocType; title: string; raw?: unknown }> {
+  // Documents can be long; the input is shortened only when it exceeds this many characters.
+  const maxChars = 60_000;
+  let input = text;
+  if (text.length > maxChars) {
+    input = text.slice(0, maxChars) + '\n\n…[документ обрезан]';
+  }
+
+  const llm = await deepseekJsonChat<LlmResult>(
+    [
+      { role: 'system', content: SYSTEM_PROMPT },
+      { role: 'user', content: input },
+    ],
+    { temperature: 0.1, maxTokens: 8000 },
+  );
+  const built = llmToDocBody(llm);
+
+  // Final validation on our side: the converted body must satisfy the shared schema.
+  const validation = DocBodySchema.safeParse(built.docBody);
+  if (validation.success) {
+    return { ...built, docBody: validation.data, raw: llm };
+  }
+  throw Object.assign(new Error('DocBody после LLM не прошёл валидацию'), {
+    code: 'INVALID_DOC_BODY',
+    issues: validation.error.flatten(),
+    raw: built.docBody,
+  });
+}