/**
 * One-off extraction script: reads a PDF from `<cwd>/public`, pulls the text
 * of every page with pdf.js, and writes the result next to it as a `.txt`
 * file. Each page is introduced by a `--- Page N ---` marker line.
 *
 * Run from the project root, because paths are resolved against
 * `process.cwd()`.
 */
import fs from 'node:fs';
import path from 'node:path';
import process from 'node:process';

const projectRoot = process.cwd();
const inPath = path.join(projectRoot, 'public', '335961_DL_ZahnPRIVAT_VKB_05_2024.pdf');
const outPath = path.join(projectRoot, 'public', '335961_DL_ZahnPRIVAT_VKB_05_2024.txt');

// The legacy build is loaded dynamically, as in the original script.
const pdfjs = await import('pdfjs-dist/legacy/build/pdf.mjs');

/**
 * Collects the text of a single page as one space-separated string.
 *
 * Items without a string `str` property and items whose `str` is empty are
 * skipped, so they do not produce extra separators.
 *
 * @param {object} doc - Loaded pdf.js document (result of `loadingTask.promise`).
 * @param {number} pageNum - 1-based page number.
 * @returns {Promise<string>} The page's text fragments joined by single spaces.
 */
async function extractPageText(doc, pageNum) {
  const page = await doc.getPage(pageNum);
  const content = await page.getTextContent();
  return content.items
    .map((item) => (typeof item?.str === 'string' ? item.str : ''))
    .filter(Boolean)
    .join(' ');
}

const data = new Uint8Array(fs.readFileSync(inPath));
const loadingTask = pdfjs.getDocument({ data });

let text = '';
let pageCount = 0;
try {
  const doc = await loadingTask.promise;
  // Captured up front so the count is still available after the document
  // has been destroyed in the `finally` block.
  pageCount = doc.numPages;

  // Pages are processed sequentially to keep the output in page order.
  for (let pageNum = 1; pageNum <= pageCount; pageNum++) {
    const pageText = await extractPageText(doc, pageNum);
    text += `\n\n--- Page ${pageNum} ---\n`;
    text += pageText;
    text += '\n';
  }
} finally {
  // Release the document and its worker even when extraction fails part-way.
  await loadingTask.destroy();
}

fs.writeFileSync(outPath, text, 'utf8');

console.log('ok');
console.log('pages:', pageCount);
console.log('chars:', text.length);
console.log('out:', outPath);
console.log(`preview:\n${text.slice(0, 1200)}`);