import fs from 'node:fs';
import path from 'node:path';
import process from 'node:process';

/*
 * Reads a fixed PDF from `<cwd>/public`, extracts the text items of every
 * page with pdf.js, writes the result to a `.txt` file with the same base
 * name, and prints a short summary plus a preview to stdout.
 *
 * Any failure (missing input file, unparsable PDF, unwritable output) is left
 * to propagate and terminate the script with a non-zero exit code.
 */
const projectRoot = process.cwd();
const inPath = path.join(projectRoot, 'public', '335961_DL_ZahnPRIVAT_VKB_05_2024.pdf');
const outPath = path.join(projectRoot, 'public', '335961_DL_ZahnPRIVAT_VKB_05_2024.txt');

const pdfjs = await import('pdfjs-dist/legacy/build/pdf.mjs');

// The file contents are copied into a plain Uint8Array before being handed to pdf.js.
const data = new Uint8Array(fs.readFileSync(inPath));
const loadingTask = pdfjs.getDocument({ data });

const sections = [];
let pageCount = 0;

try {
  const doc = await loadingTask.promise;
  // Captured up front so the summary below does not touch a destroyed document.
  pageCount = doc.numPages;

  // pdf.js page numbers are 1-based.
  for (let pageNum = 1; pageNum <= pageCount; pageNum++) {
    const page = await doc.getPage(pageNum);
    try {
      const content = await page.getTextContent();
      // Keep only items that carry a non-empty string; anything else is skipped.
      const pageText = content.items
        .map((it) => (typeof it?.str === 'string' ? it.str : ''))
        .filter(Boolean)
        .join(' ');

      sections.push(`\n\n--- Page ${pageNum} ---\n${pageText}\n`);
    } finally {
      // Drop per-page resources before moving on to the next page.
      page.cleanup();
    }
  }
} finally {
  // Release the document and its worker even when extraction fails part-way.
  await loadingTask.destroy();
}

const text = sections.join('');

fs.writeFileSync(outPath, text, 'utf8');

console.log('ok');
console.log('pages:', pageCount);
console.log('chars:', text.length);
console.log('out:', outPath);
console.log(`preview:\n${text.slice(0, 1200)}`);