Files
doc-manager/apps/api/src/modules/templates/extract.ts
T
2026-05-01 12:40:03 +03:00

115 lines
4.6 KiB
TypeScript
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import { execFile } from 'node:child_process';
import { mkdtemp, readFile, readdir, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { promisify } from 'node:util';
import { createRequire } from 'node:module';
import mammoth from 'mammoth';
// pdf-parse is CJS-only. A direct ESM import breaks because of the auto-test mode in its index.js.
// createRequire provides a CJS loader in which module.parent is defined, so the test mode is not activated.
const require = createRequire(import.meta.url);
// Typed handle to pdf-parse; only the `text` and `numpages` fields of its result are used in this file.
const pdfParse: (buf: Buffer) => Promise<{ text: string; numpages: number }> = require('pdf-parse');
// Promise-returning wrapper around child_process.execFile, used to run the external OCR tools.
const execFileP = promisify(execFile);
// Identifies which extraction path produced the text.
export type ExtractedSource = 'docx' | 'pdf-text' | 'pdf-ocr' | 'image-ocr';
// Extraction result; `pageCount` is only filled in by the PDF paths.
export type Extracted = { text: string; source: ExtractedSource; pageCount?: number };
// Language list handed to tesseract through its `-l` option.
const OCR_LANGS = 'rus+eng';
/**
 * Extracts readable text from an arbitrary uploaded document.
 * - DOCX → mammoth (no OCR);
 * - PDF → `extractFromPdf` (text layer via pdf-parse, OCR fallback for scans);
 * - PNG/JPG/TIFF/BMP (or any `image/*` MIME type) → tesseract directly.
 *
 * The MIME type and the file name are matched separately and case-insensitively:
 * the MIME type is only searched for type markers, the file name only for its
 * extension. This keeps a file name that merely contains a marker such as
 * "wordprocessingml" from being routed to the wrong pipeline.
 *
 * @param buf Raw bytes of the uploaded file.
 * @param mime MIME type reported for the upload (may be empty).
 * @param filename Original file name, used as a fallback via its extension.
 * @returns The cleaned-up text together with the pipeline that produced it.
 * @throws Error with `code: 'UNSUPPORTED_MIME'` when neither the MIME type nor the
 *   file name matches a supported format. Errors raised by the individual pipelines
 *   propagate unchanged (the OCR helpers use `code: 'NO_OCR'` when an external
 *   binary is missing).
 */
export async function extractText(buf: Buffer, mime: string, filename: string): Promise<Extracted> {
  // MIME types are case-insensitive, so normalize both inputs once up front.
  const mimeLower = mime.toLowerCase();
  const nameLower = filename.toLowerCase();

  if (mimeLower.includes('wordprocessingml') || nameLower.endsWith('.docx')) {
    const r = await mammoth.extractRawText({ buffer: buf });
    return { text: cleanup(r.value), source: 'docx' };
  }

  if (mimeLower.includes('pdf') || nameLower.endsWith('.pdf')) {
    return extractFromPdf(buf);
  }

  if (mimeLower.startsWith('image/') || /\.(png|jpe?g|tiff?|bmp)$/.test(nameLower)) {
    // The temp file is always named with a .png extension regardless of the real format.
    // NOTE(review): presumably tesseract detects the image format from content — confirm.
    const text = await ocrImageBuffer(buf, '.png');
    return { text: cleanup(text), source: 'image-ocr' };
  }

  throw Object.assign(new Error(`Неподдерживаемый формат: ${mime || filename}`), {
    code: 'UNSUPPORTED_MIME',
  });
}
/**
 * Extracts text from a PDF: the embedded text layer is used when it looks
 * substantial, otherwise the document is treated as a scan and sent through OCR.
 *
 * @param buf Raw bytes of the PDF.
 * @returns Cleaned text, the path that produced it, and the page count reported by pdf-parse.
 */
async function extractFromPdf(buf: Buffer): Promise<Extracted> {
  const { text: rawLayer, numpages: pageCount } = await pdfParse(buf);
  const layerText = cleanup(rawLayer);

  // Heuristic: the text layer is trusted only when it has more than 200 characters
  // overall and more than 60 characters per page; anything below is considered a scanned PDF.
  const density = pageCount > 0 ? layerText.length / pageCount : layerText.length;
  const looksScanned = layerText.length <= 200 || density <= 60;

  if (!looksScanned) {
    return { text: layerText, source: 'pdf-text', pageCount };
  }

  // OCR pipeline
  const recognized = await ocrPdfBuffer(buf);
  return { text: cleanup(recognized), source: 'pdf-ocr', pageCount };
}
/**
 * Runs tesseract on a single image held in memory and returns the recognized text.
 *
 * The image is written to a fresh temporary directory, tesseract is invoked with
 * `stdout` as its output target, and the directory is removed afterwards whether
 * or not OCR succeeded.
 *
 * @param buf Raw image bytes.
 * @param ext Extension (including the dot) used for the temporary input file name.
 * @returns Text printed by tesseract on stdout.
 * @throws Error with `code: 'NO_OCR'` when the `tesseract` binary cannot be spawned
 *   (ENOENT). Any other failure, including filesystem errors, is rethrown unchanged.
 */
async function ocrImageBuffer(buf: Buffer, ext: string): Promise<string> {
  const dir = await mkdtemp(join(tmpdir(), 'docmgr-ocr-'));
  try {
    const inFile = join(dir, `in${ext}`);
    await writeFile(inFile, buf);

    // Only the spawn itself is guarded: an ENOENT coming from the filesystem calls
    // above must not be reported as "tesseract is not installed".
    try {
      const { stdout } = await execFileP('tesseract', [inFile, 'stdout', '-l', OCR_LANGS], {
        maxBuffer: 32 * 1024 * 1024,
      });
      return stdout;
    } catch (e) {
      if ((e as { code?: string }).code === 'ENOENT') {
        throw Object.assign(new Error('Tesseract не установлен в окружении. PDF-сканы обработать не получится.'), {
          code: 'NO_OCR',
        });
      }
      throw e;
    }
  } finally {
    await rm(dir, { recursive: true, force: true });
  }
}
/**
 * OCRs a PDF held in memory: rasterizes it to PNG pages with pdftoppm, runs each
 * page through `ocrImageBuffer` in order, and joins the page texts with a blank line.
 *
 * All intermediate files live in a temporary directory that is removed at the end,
 * whether or not the conversion succeeded.
 *
 * @param buf Raw bytes of the PDF.
 * @returns Recognized text of all pages, separated by `\n\n`.
 * @throws Error with `code: 'NO_OCR'` when the `pdftoppm` binary cannot be spawned
 *   (ENOENT); other errors are rethrown unchanged.
 */
async function ocrPdfBuffer(buf: Buffer): Promise<string> {
  const workDir = await mkdtemp(join(tmpdir(), 'docmgr-pdf-'));
  try {
    const pdfPath = join(workDir, 'in.pdf');
    const pagePrefix = join(workDir, 'out');
    await writeFile(pdfPath, buf);

    // pdftoppm in.pdf out -png -r 200 → out-1.png, out-2.png ...
    try {
      await execFileP('pdftoppm', [pdfPath, pagePrefix, '-png', '-r', '200']);
    } catch (err) {
      const missingBinary = (err as { code?: string }).code === 'ENOENT';
      throw missingBinary
        ? Object.assign(new Error('pdftoppm (poppler-utils) не установлен.'), { code: 'NO_OCR' })
        : err;
    }

    // Collect the rendered pages. The order comes from a plain lexicographic sort.
    // NOTE(review): this relies on pdftoppm zero-padding page numbers for 10+ pages — confirm.
    const pageImages: string[] = [];
    for (const entry of await readdir(workDir)) {
      if (entry.startsWith('out-') && entry.endsWith('.png')) {
        pageImages.push(entry);
      }
    }
    pageImages.sort();

    // Pages are recognized one at a time, in page order.
    const pageTexts: string[] = [];
    for (const image of pageImages) {
      const imageBytes = await readFile(join(workDir, image));
      pageTexts.push(await ocrImageBuffer(imageBytes, '.png'));
    }
    return pageTexts.join('\n\n');
  } finally {
    await rm(workDir, { recursive: true, force: true });
  }
}
/**
 * Normalizes extracted text: removes soft hyphens, strips trailing spaces/tabs at
 * line ends, collapses runs of three or more newlines into one blank line, and
 * trims the result.
 *
 * @param s Raw text from a DOCX/PDF/OCR pipeline.
 * @returns The normalized text.
 */
function cleanup(s: string): string {
  return s
    // Soft hyphen (U+00AD), written as an escape so no invisible character is embedded in the source.
    .replace(/\u00AD/g, '')
    // Spaces and tabs immediately before a newline.
    .replace(/[ \t]+\n/g, '\n')
    // At most one empty line between paragraphs.
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}