marian/scripts/extract-pdf-text.mjs
2026-01-24 17:10:06 +01:00

36 lines
1.1 KiB
JavaScript

import fs from 'node:fs';
import path from 'node:path';
import process from 'node:process';
// Extracts the text of a fixed PDF under <cwd>/public and writes it to a
// sibling .txt file, with one "--- Page N ---" section per page. Afterwards it
// prints a short summary (page count, character count, output path) and a
// preview of the first 1200 characters to stdout.
//
// Paths are resolved against the current working directory, so the script is
// expected to be started from the project root.
const projectRoot = process.cwd();
const publicDir = path.join(projectRoot, 'public');
const inPath = path.join(publicDir, '335961_DL_ZahnPRIVAT_VKB_05_2024.pdf');
const outPath = path.join(publicDir, '335961_DL_ZahnPRIVAT_VKB_05_2024.txt');

const pdfjs = await import('pdfjs-dist/legacy/build/pdf.mjs');

// The PDF bytes are handed to pdf.js as a Uint8Array view of the file buffer.
const data = new Uint8Array(fs.readFileSync(inPath));
const loadingTask = pdfjs.getDocument({ data });

let pageCount = 0;
let text = '';
try {
  const doc = await loadingTask.promise;
  // Captured up front so the summary below does not touch the document after
  // it has been torn down.
  pageCount = doc.numPages;

  const sections = [];
  // pdf.js page numbers are 1-based.
  for (let pageNum = 1; pageNum <= pageCount; pageNum++) {
    const page = await doc.getPage(pageNum);
    const content = await page.getTextContent();
    // Keep only items that carry a non-empty string `str`; everything else is
    // dropped before the fragments are joined with single spaces.
    const pageText = content.items
      .map((item) => (typeof item?.str === 'string' ? item.str : ''))
      .filter(Boolean)
      .join(' ');
    sections.push(`\n\n--- Page ${pageNum} ---\n${pageText}\n`);
  }
  text = sections.join('');
} finally {
  // Release the document and its worker even if loading or a page failed.
  await loadingTask.destroy();
}

fs.writeFileSync(outPath, text, 'utf8');

console.log('ok');
console.log('pages:', pageCount);
console.log('chars:', text.length);
console.log('out:', outPath);
console.log(`preview:\n${text.slice(0, 1200)}`);