diff --git a/.env.example b/.env.example index 3b1655d..fee0b32 100644 --- a/.env.example +++ b/.env.example @@ -12,6 +12,15 @@ VITE_WASM_PYMUPDF_URL=https://cdn.jsdelivr.net/npm/@bentopdf/pymupdf-wasm@0.11.1 VITE_WASM_GS_URL=https://cdn.jsdelivr.net/npm/@bentopdf/gs-wasm/assets/ VITE_WASM_CPDF_URL=https://cdn.jsdelivr.net/npm/coherentpdf/dist/ +# OCR assets (optional) +# Set all three together for self-hosted or air-gapped OCR. +# Leave empty to use Tesseract.js runtime defaults. +VITE_TESSERACT_WORKER_URL= +VITE_TESSERACT_CORE_URL= +VITE_TESSERACT_LANG_URL= +VITE_TESSERACT_AVAILABLE_LANGUAGES= +VITE_OCR_FONT_BASE_URL= + # Default UI language (build-time) # Supported: en, ar, be, fr, de, es, zh, zh-TW, vi, tr, id, it, pt, nl, da VITE_DEFAULT_LANGUAGE= diff --git a/Dockerfile b/Dockerfile index 12520e5..1e962ad 100644 --- a/Dockerfile +++ b/Dockerfile @@ -35,6 +35,18 @@ ENV VITE_WASM_PYMUPDF_URL=$VITE_WASM_PYMUPDF_URL ENV VITE_WASM_GS_URL=$VITE_WASM_GS_URL ENV VITE_WASM_CPDF_URL=$VITE_WASM_CPDF_URL +# OCR asset URLs (optional, used for self-hosted or air-gapped OCR) +ARG VITE_TESSERACT_WORKER_URL +ARG VITE_TESSERACT_CORE_URL +ARG VITE_TESSERACT_LANG_URL +ARG VITE_TESSERACT_AVAILABLE_LANGUAGES +ARG VITE_OCR_FONT_BASE_URL +ENV VITE_TESSERACT_WORKER_URL=$VITE_TESSERACT_WORKER_URL +ENV VITE_TESSERACT_CORE_URL=$VITE_TESSERACT_CORE_URL +ENV VITE_TESSERACT_LANG_URL=$VITE_TESSERACT_LANG_URL +ENV VITE_TESSERACT_AVAILABLE_LANGUAGES=$VITE_TESSERACT_AVAILABLE_LANGUAGES +ENV VITE_OCR_FONT_BASE_URL=$VITE_OCR_FONT_BASE_URL + # Default UI language (e.g. 
en, fr, de, es, zh, ar) ARG VITE_DEFAULT_LANGUAGE ENV VITE_DEFAULT_LANGUAGE=$VITE_DEFAULT_LANGUAGE diff --git a/Dockerfile.nonroot b/Dockerfile.nonroot index dc1e1a8..0599daf 100644 --- a/Dockerfile.nonroot +++ b/Dockerfile.nonroot @@ -32,6 +32,17 @@ ENV VITE_WASM_PYMUPDF_URL=$VITE_WASM_PYMUPDF_URL ENV VITE_WASM_GS_URL=$VITE_WASM_GS_URL ENV VITE_WASM_CPDF_URL=$VITE_WASM_CPDF_URL +ARG VITE_TESSERACT_WORKER_URL +ARG VITE_TESSERACT_CORE_URL +ARG VITE_TESSERACT_LANG_URL +ARG VITE_TESSERACT_AVAILABLE_LANGUAGES +ARG VITE_OCR_FONT_BASE_URL +ENV VITE_TESSERACT_WORKER_URL=$VITE_TESSERACT_WORKER_URL +ENV VITE_TESSERACT_CORE_URL=$VITE_TESSERACT_CORE_URL +ENV VITE_TESSERACT_LANG_URL=$VITE_TESSERACT_LANG_URL +ENV VITE_TESSERACT_AVAILABLE_LANGUAGES=$VITE_TESSERACT_AVAILABLE_LANGUAGES +ENV VITE_OCR_FONT_BASE_URL=$VITE_OCR_FONT_BASE_URL + # Default UI language (e.g. en, fr, de, es, zh, ar) ARG VITE_DEFAULT_LANGUAGE ENV VITE_DEFAULT_LANGUAGE=$VITE_DEFAULT_LANGUAGE diff --git a/README.md b/README.md index 0928f93..d7711b4 100644 --- a/README.md +++ b/README.md @@ -465,6 +465,11 @@ The default URLs are set in `.env.production`: VITE_WASM_PYMUPDF_URL=https://cdn.jsdelivr.net/npm/@bentopdf/pymupdf-wasm@0.11.16/ VITE_WASM_GS_URL=https://cdn.jsdelivr.net/npm/@bentopdf/gs-wasm/assets/ VITE_WASM_CPDF_URL=https://cdn.jsdelivr.net/npm/coherentpdf/dist/ +VITE_TESSERACT_WORKER_URL= +VITE_TESSERACT_CORE_URL= +VITE_TESSERACT_LANG_URL= +VITE_TESSERACT_AVAILABLE_LANGUAGES= +VITE_OCR_FONT_BASE_URL= ``` To override via Docker build args: @@ -474,11 +479,18 @@ docker build \ --build-arg VITE_WASM_PYMUPDF_URL=https://your-server.com/pymupdf/ \ --build-arg VITE_WASM_GS_URL=https://your-server.com/gs/ \ --build-arg VITE_WASM_CPDF_URL=https://your-server.com/cpdf/ \ + --build-arg VITE_TESSERACT_WORKER_URL=https://your-server.com/ocr/worker.min.js \ + --build-arg VITE_TESSERACT_CORE_URL=https://your-server.com/ocr/core \ + --build-arg VITE_TESSERACT_LANG_URL=https://your-server.com/ocr/lang-data \ + 
--build-arg VITE_TESSERACT_AVAILABLE_LANGUAGES=eng,deu \ + --build-arg VITE_OCR_FONT_BASE_URL=https://your-server.com/ocr/fonts \ -t bentopdf . ``` To disable a module (require manual user config via Advanced Settings), set its variable to an empty string. +For OCR, either leave all `VITE_TESSERACT_*` variables empty and use the default online assets, or set the worker/core/lang URLs together for self-hosted/offline OCR. If your self-hosted bundle only includes a subset such as `eng,deu`, also set `VITE_TESSERACT_AVAILABLE_LANGUAGES=eng,deu` so the UI only shows bundled languages and OCR fails with a descriptive message for unsupported ones. For fully offline searchable-PDF output, also set `VITE_OCR_FONT_BASE_URL` to the internal directory that serves the bundled OCR text-layer fonts. + Users can also override these defaults per-browser via **Advanced Settings** in the UI — user overrides take priority over the environment defaults. > [!IMPORTANT] @@ -496,6 +508,12 @@ The included `prepare-airgap.sh` script automates the entire process — downloa git clone https://github.com/alam00000/bentopdf.git cd bentopdf +# Show supported OCR language codes (for --ocr-languages) +bash scripts/prepare-airgap.sh --list-ocr-languages + +# Search OCR language codes by name or abbreviation +bash scripts/prepare-airgap.sh --search-ocr-language german + # Interactive mode — prompts for all options bash scripts/prepare-airgap.sh @@ -508,7 +526,9 @@ This produces a bundle directory containing: ``` bentopdf-airgap-bundle/ bentopdf.tar # Docker image - *.tgz # WASM packages (PyMuPDF, Ghostscript, CoherentPDF) + *.tgz # WASM packages (PyMuPDF, Ghostscript, CoherentPDF, Tesseract) + tesseract-langdata/ # OCR traineddata files + ocr-fonts/ # OCR text-layer font files setup.sh # Setup script for the air-gapped side README.md # Instructions ``` @@ -525,23 +545,28 @@ The setup script loads the Docker image, extracts WASM files, and optionally sta
Script options -| Flag | Description | Default | -| ----------------------- | ------------------------------------------------ | --------------------------------- | -| `--wasm-base-url ` | Where WASMs will be hosted internally | _(required, prompted if missing)_ | -| `--image-name ` | Docker image tag | `bentopdf` | -| `--output-dir ` | Output bundle directory | `./bentopdf-airgap-bundle` | -| `--simple-mode` | Enable Simple Mode | off | -| `--base-url ` | Subdirectory base URL (e.g. `/pdf/`) | `/` | -| `--language ` | Default UI language (e.g. `fr`, `de`) | _(none)_ | -| `--brand-name ` | Custom brand name | _(none)_ | -| `--brand-logo ` | Logo path relative to `public/` | _(none)_ | -| `--footer-text ` | Custom footer text | _(none)_ | -| `--dockerfile ` | Dockerfile to use | `Dockerfile` | -| `--skip-docker` | Skip Docker build and export | off | -| `--skip-wasm` | Skip WASM download (reuse existing `.tgz` files) | off | +| Flag | Description | Default | +| ------------------------------ | ------------------------------------------------ | --------------------------------- | +| `--wasm-base-url ` | Where WASMs will be hosted internally | _(required, prompted if missing)_ | +| `--image-name ` | Docker image tag | `bentopdf` | +| `--output-dir ` | Output bundle directory | `./bentopdf-airgap-bundle` | +| `--simple-mode` | Enable Simple Mode | off | +| `--base-url ` | Subdirectory base URL (e.g. `/pdf/`) | `/` | +| `--language ` | Default UI language (e.g. 
`fr`, `de`) | _(none)_ | +| `--brand-name <name>` | Custom brand name | _(none)_ | +| `--brand-logo <path>` | Logo path relative to `public/` | _(none)_ | +| `--footer-text <text>` | Custom footer text | _(none)_ | +| `--ocr-languages <list>` | Comma-separated OCR languages to bundle | `eng` | +| `--list-ocr-languages` | Print supported OCR codes and names, then exit | off | +| `--search-ocr-language <term>` | Search OCR codes by name or abbreviation | off | +| `--dockerfile <path>` | Dockerfile to use | `Dockerfile` | +| `--skip-docker` | Skip Docker build and export | off | +| `--skip-wasm` | Skip WASM download (reuse existing `.tgz` files) | off |
+The interactive prompt also accepts `list` to print the full supported Tesseract code list and `search <term>` to find matches such as `search german` or `search chi`. + > [!IMPORTANT] > WASM files must be served from the **same origin** as the BentoPDF app. Web Workers use `importScripts()` which cannot load scripts cross-origin. For example, if BentoPDF runs at `https://internal.example.com`, the WASM base URL should also be `https://internal.example.com/wasm`. @@ -550,12 +575,18 @@ The setup script loads the Docker image, extracts WASM files, and optionally sta
If you prefer to do it manually without the script -**Step 1: Download the WASM packages** (on a machine with internet) +**Step 1: Download the WASM and OCR packages** (on a machine with internet) ```bash npm pack @bentopdf/pymupdf-wasm@0.11.16 npm pack @bentopdf/gs-wasm npm pack coherentpdf +npm pack tesseract.js@7.0.0 +npm pack tesseract.js-core@7.0.0 +mkdir -p tesseract-langdata +curl -fsSL https://cdn.jsdelivr.net/npm/@tesseract.js-data/eng/4.0.0_best_int/eng.traineddata.gz -o tesseract-langdata/eng.traineddata.gz +mkdir -p ocr-fonts +curl -fsSL https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSans/NotoSans-Regular.ttf -o ocr-fonts/NotoSans-Regular.ttf ``` **Step 2: Build the Docker image with internal URLs** @@ -568,6 +599,10 @@ docker build \ --build-arg VITE_WASM_PYMUPDF_URL=https://internal-server.example.com/wasm/pymupdf/ \ --build-arg VITE_WASM_GS_URL=https://internal-server.example.com/wasm/gs/ \ --build-arg VITE_WASM_CPDF_URL=https://internal-server.example.com/wasm/cpdf/ \ + --build-arg VITE_TESSERACT_WORKER_URL=https://internal-server.example.com/wasm/ocr/worker.min.js \ + --build-arg VITE_TESSERACT_CORE_URL=https://internal-server.example.com/wasm/ocr/core \ + --build-arg VITE_TESSERACT_LANG_URL=https://internal-server.example.com/wasm/ocr/lang-data \ + --build-arg VITE_OCR_FONT_BASE_URL=https://internal-server.example.com/wasm/ocr/fonts \ -t bentopdf . 
``` @@ -585,6 +620,10 @@ Copy these files via USB drive, internal artifact repository, or approved transf - `bentopdf-pymupdf-wasm-0.11.14.tgz` — PyMuPDF WASM package - `bentopdf-gs-wasm-*.tgz` — Ghostscript WASM package - `coherentpdf-*.tgz` — CoherentPDF WASM package +- `tesseract.js-7.0.0.tgz` — Tesseract worker package +- `tesseract.js-core-7.0.0.tgz` — Tesseract core runtime package +- `tesseract-langdata/` — OCR traineddata files +- `ocr-fonts/` — OCR text-layer font files **Step 5: Set up inside the air-gapped network** @@ -593,16 +632,23 @@ Copy these files via USB drive, internal artifact repository, or approved transf docker load -i bentopdf.tar # Extract the WASM packages -mkdir -p ./wasm/pymupdf ./wasm/gs ./wasm/cpdf +mkdir -p ./wasm/pymupdf ./wasm/gs ./wasm/cpdf ./wasm/ocr/core ./wasm/ocr/lang-data ./wasm/ocr/fonts tar xzf bentopdf-pymupdf-wasm-0.11.14.tgz -C ./wasm/pymupdf --strip-components=1 tar xzf bentopdf-gs-wasm-*.tgz -C ./wasm/gs --strip-components=1 tar xzf coherentpdf-*.tgz -C ./wasm/cpdf --strip-components=1 +TEMP_TESS=$(mktemp -d) +tar xzf tesseract.js-7.0.0.tgz -C "$TEMP_TESS" +cp "$TEMP_TESS/package/dist/worker.min.js" ./wasm/ocr/worker.min.js +rm -rf "$TEMP_TESS" +tar xzf tesseract.js-core-7.0.0.tgz -C ./wasm/ocr/core --strip-components=1 +cp ./tesseract-langdata/*.traineddata.gz ./wasm/ocr/lang-data/ +cp ./ocr-fonts/* ./wasm/ocr/fonts/ # Run BentoPDF docker run -d -p 3000:8080 --restart unless-stopped bentopdf ``` -Make sure the WASM files are accessible at the URLs you configured in Step 2. +Make sure the files are accessible at the URLs you configured in Step 2, including `.../ocr/worker.min.js`, `.../ocr/core`, `.../ocr/lang-data`, and `.../ocr/fonts`.
@@ -613,6 +659,10 @@ Make sure the WASM files are accessible at the URLs you configured in Step 2. > VITE_WASM_PYMUPDF_URL=https://internal-server.example.com/wasm/pymupdf/ > VITE_WASM_GS_URL=https://internal-server.example.com/wasm/gs/ > VITE_WASM_CPDF_URL=https://internal-server.example.com/wasm/cpdf/ +> VITE_TESSERACT_WORKER_URL=https://internal-server.example.com/wasm/ocr/worker.min.js +> VITE_TESSERACT_CORE_URL=https://internal-server.example.com/wasm/ocr/core +> VITE_TESSERACT_LANG_URL=https://internal-server.example.com/wasm/ocr/lang-data +> VITE_OCR_FONT_BASE_URL=https://internal-server.example.com/wasm/ocr/fonts > ``` **Subdirectory Hosting:** diff --git a/docs/getting-started.md b/docs/getting-started.md index 1ac33fc..7d3a257 100644 --- a/docs/getting-started.md +++ b/docs/getting-started.md @@ -34,6 +34,9 @@ docker compose up -d Then open `http://localhost:3000` in your browser. +> [!NOTE] +> If you are preparing an air-gapped OCR deployment, you must host the OCR text-layer fonts internally in addition to the Tesseract worker, core runtime, and traineddata files. The full setup is documented in [Self-Hosting](/self-hosting/), including `VITE_OCR_FONT_BASE_URL` and the bundled `ocr-fonts/` directory. + ### Option 3: Build from Source ```bash diff --git a/docs/index.md b/docs/index.md index 5ba49b4..c3630e1 100644 --- a/docs/index.md +++ b/docs/index.md @@ -32,5 +32,11 @@ features: details: Convert, edit, merge, split, compress, sign, OCR, and more. Everything you need in one place. - icon: 🌐 title: Self-Hostable - details: Deploy on your own infrastructure. Docker, Vercel, Netlify, AWS, or any static hosting. + details: Deploy on your own infrastructure. Docker, Vercel, Netlify, AWS, or fully air-gapped environments with self-hosted OCR workers, language data, and text-layer fonts. + +## Offline OCR + +If you self-host BentoPDF in an air-gapped or offline environment, OCR needs more than the Tesseract worker and traineddata files. 
Searchable PDF output also needs the OCR text-layer fonts to be served internally. + +See [Self-Hosting](/self-hosting/) for the full setup, including `VITE_OCR_FONT_BASE_URL`, the bundled `ocr-fonts/` directory, and the updated air-gap workflow. --- diff --git a/docs/self-hosting/docker.md b/docs/self-hosting/docker.md index 97123d7..e1c58d4 100644 --- a/docs/self-hosting/docker.md +++ b/docs/self-hosting/docker.md @@ -90,20 +90,27 @@ docker run -d -p 3000:8080 bentopdf:custom ## Environment Variables -| Variable | Description | Default | -| ----------------------- | ------------------------------- | -------------------------------------------------------------- | -| `SIMPLE_MODE` | Build without LibreOffice tools | `false` | -| `BASE_URL` | Deploy to subdirectory | `/` | -| `VITE_WASM_PYMUPDF_URL` | PyMuPDF WASM module URL | `https://cdn.jsdelivr.net/npm/@bentopdf/pymupdf-wasm@0.11.16/` | -| `VITE_WASM_GS_URL` | Ghostscript WASM module URL | `https://cdn.jsdelivr.net/npm/@bentopdf/gs-wasm/assets/` | -| `VITE_WASM_CPDF_URL` | CoherentPDF WASM module URL | `https://cdn.jsdelivr.net/npm/coherentpdf/dist/` | -| `VITE_DEFAULT_LANGUAGE` | Default UI language | `en` | -| `VITE_BRAND_NAME` | Custom brand name | `BentoPDF` | -| `VITE_BRAND_LOGO` | Logo path relative to `public/` | `images/favicon-no-bg.svg` | -| `VITE_FOOTER_TEXT` | Custom footer/copyright text | `© 2026 BentoPDF. 
All rights reserved.` | +| Variable | Description | Default | +| ------------------------------------ | ------------------------------------------- | -------------------------------------------------------------- | +| `SIMPLE_MODE` | Build without LibreOffice tools | `false` | +| `BASE_URL` | Deploy to subdirectory | `/` | +| `VITE_WASM_PYMUPDF_URL` | PyMuPDF WASM module URL | `https://cdn.jsdelivr.net/npm/@bentopdf/pymupdf-wasm@0.11.16/` | +| `VITE_WASM_GS_URL` | Ghostscript WASM module URL | `https://cdn.jsdelivr.net/npm/@bentopdf/gs-wasm/assets/` | +| `VITE_WASM_CPDF_URL` | CoherentPDF WASM module URL | `https://cdn.jsdelivr.net/npm/coherentpdf/dist/` | +| `VITE_TESSERACT_WORKER_URL` | OCR worker script URL | _(empty; use Tesseract.js default CDN)_ | +| `VITE_TESSERACT_CORE_URL` | OCR core runtime directory | _(empty; use Tesseract.js default CDN)_ | +| `VITE_TESSERACT_LANG_URL` | OCR traineddata directory | _(empty; use Tesseract.js default CDN)_ | +| `VITE_TESSERACT_AVAILABLE_LANGUAGES` | Comma-separated OCR languages exposed in UI | _(empty; show full catalog)_ | +| `VITE_OCR_FONT_BASE_URL` | OCR text-layer font directory | _(empty; use remote Noto font URLs)_ | +| `VITE_DEFAULT_LANGUAGE` | Default UI language | `en` | +| `VITE_BRAND_NAME` | Custom brand name | `BentoPDF` | +| `VITE_BRAND_LOGO` | Logo path relative to `public/` | `images/favicon-no-bg.svg` | +| `VITE_FOOTER_TEXT` | Custom footer/copyright text | `© 2026 BentoPDF. All rights reserved.` | WASM module URLs are pre-configured with CDN defaults — all advanced features work out of the box. Override these for air-gapped or self-hosted deployments. +For OCR, leave the `VITE_TESSERACT_*` variables empty to use the default online assets, or set the three URL variables (`VITE_TESSERACT_WORKER_URL`, `VITE_TESSERACT_CORE_URL`, and `VITE_TESSERACT_LANG_URL`) together for self-hosted/offline OCR. Partial OCR overrides are rejected because the worker, core runtime, and traineddata directory must match. 
For fully offline searchable PDF output, also set `VITE_OCR_FONT_BASE_URL` so the OCR text-layer fonts are loaded from your internal server instead of the public Noto font URLs. + `VITE_DEFAULT_LANGUAGE` sets the UI language for first-time visitors. Supported values: `en`, `ar`, `be`, `fr`, `de`, `es`, `zh`, `zh-TW`, `vi`, `tr`, `id`, `it`, `pt`, `nl`, `da`. Users can still switch languages — this only changes the default. Example: @@ -137,35 +144,59 @@ Branding works in both full mode and Simple Mode, and can be combined with all o ```bash # 1. On a machine WITH internet — download WASM packages +bash scripts/prepare-airgap.sh --list-ocr-languages +bash scripts/prepare-airgap.sh --search-ocr-language german + +# 2. Download WASM/OCR packages npm pack @bentopdf/pymupdf-wasm@0.11.14 npm pack @bentopdf/gs-wasm npm pack coherentpdf +npm pack tesseract.js@7.0.0 +npm pack tesseract.js-core@7.0.0 +mkdir -p tesseract-langdata +curl -fsSL https://cdn.jsdelivr.net/npm/@tesseract.js-data/eng/4.0.0_best_int/eng.traineddata.gz -o tesseract-langdata/eng.traineddata.gz +mkdir -p ocr-fonts +curl -fsSL https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSans/NotoSans-Regular.ttf -o ocr-fonts/NotoSans-Regular.ttf -# 2. Build the image with your internal server URLs +# 3. 
Build the image with your internal server URLs docker build \ --build-arg VITE_WASM_PYMUPDF_URL=https://internal-server.example.com/wasm/pymupdf/ \ --build-arg VITE_WASM_GS_URL=https://internal-server.example.com/wasm/gs/ \ --build-arg VITE_WASM_CPDF_URL=https://internal-server.example.com/wasm/cpdf/ \ + --build-arg VITE_TESSERACT_WORKER_URL=https://internal-server.example.com/wasm/ocr/worker.min.js \ + --build-arg VITE_TESSERACT_CORE_URL=https://internal-server.example.com/wasm/ocr/core \ + --build-arg VITE_TESSERACT_LANG_URL=https://internal-server.example.com/wasm/ocr/lang-data \ + --build-arg VITE_TESSERACT_AVAILABLE_LANGUAGES=eng,deu \ + --build-arg VITE_OCR_FONT_BASE_URL=https://internal-server.example.com/wasm/ocr/fonts \ -t bentopdf . -# 3. Export the image +# 4. Export the image docker save bentopdf -o bentopdf.tar -# 4. Transfer bentopdf.tar + the .tgz WASM packages into the air-gapped network +# 5. Transfer bentopdf.tar + the .tgz packages + tesseract-langdata/ + ocr-fonts/ into the air-gapped network -# 5. Inside the air-gapped network — load and run +# 6. 
Inside the air-gapped network — load and run docker load -i bentopdf.tar # Extract WASM packages to your internal web server -mkdir -p /var/www/wasm/pymupdf /var/www/wasm/gs /var/www/wasm/cpdf +mkdir -p /var/www/wasm/pymupdf /var/www/wasm/gs /var/www/wasm/cpdf /var/www/wasm/ocr/core /var/www/wasm/ocr/lang-data /var/www/wasm/ocr/fonts tar xzf bentopdf-pymupdf-wasm-0.11.14.tgz -C /var/www/wasm/pymupdf --strip-components=1 tar xzf bentopdf-gs-wasm-*.tgz -C /var/www/wasm/gs --strip-components=1 tar xzf coherentpdf-*.tgz -C /var/www/wasm/cpdf --strip-components=1 +TEMP_TESS=$(mktemp -d) +tar xzf tesseract.js-7.0.0.tgz -C "$TEMP_TESS" +cp "$TEMP_TESS/package/dist/worker.min.js" /var/www/wasm/ocr/worker.min.js +rm -rf "$TEMP_TESS" +tar xzf tesseract.js-core-7.0.0.tgz -C /var/www/wasm/ocr/core --strip-components=1 +cp ./tesseract-langdata/*.traineddata.gz /var/www/wasm/ocr/lang-data/ +cp ./ocr-fonts/* /var/www/wasm/ocr/fonts/ # Run BentoPDF docker run -d -p 3000:8080 --restart unless-stopped bentopdf ``` +Use the codes printed by `bash scripts/prepare-airgap.sh --list-ocr-languages`, or search by name with `bash scripts/prepare-airgap.sh --search-ocr-language `, for `--ocr-languages`. When you build with a restricted OCR subset, pass the same codes to `VITE_TESSERACT_AVAILABLE_LANGUAGES` so the app only shows bundled languages. For full offline OCR output, also host the bundled `ocr-fonts/` directory and point `VITE_OCR_FONT_BASE_URL` at it. + Set a variable to empty string to disable that module (users must configure manually via Advanced Settings). 
## Custom User ID (PUID/PGID) diff --git a/docs/self-hosting/index.md b/docs/self-hosting/index.md index 4149905..3a79e1f 100644 --- a/docs/self-hosting/index.md +++ b/docs/self-hosting/index.md @@ -175,6 +175,11 @@ These are set in `.env.production` and baked into the build: VITE_WASM_PYMUPDF_URL=https://cdn.jsdelivr.net/npm/@bentopdf/pymupdf-wasm@0.11.16/ VITE_WASM_GS_URL=https://cdn.jsdelivr.net/npm/@bentopdf/gs-wasm/assets/ VITE_WASM_CPDF_URL=https://cdn.jsdelivr.net/npm/coherentpdf/dist/ +VITE_TESSERACT_WORKER_URL= +VITE_TESSERACT_CORE_URL= +VITE_TESSERACT_LANG_URL= +VITE_TESSERACT_AVAILABLE_LANGUAGES= +VITE_OCR_FONT_BASE_URL= ``` ### Overriding WASM URLs @@ -187,6 +192,11 @@ docker build \ --build-arg VITE_WASM_PYMUPDF_URL=https://your-server.com/pymupdf/ \ --build-arg VITE_WASM_GS_URL=https://your-server.com/gs/ \ --build-arg VITE_WASM_CPDF_URL=https://your-server.com/cpdf/ \ + --build-arg VITE_TESSERACT_WORKER_URL=https://your-server.com/ocr/worker.min.js \ + --build-arg VITE_TESSERACT_CORE_URL=https://your-server.com/ocr/core \ + --build-arg VITE_TESSERACT_LANG_URL=https://your-server.com/ocr/lang-data \ + --build-arg VITE_TESSERACT_AVAILABLE_LANGUAGES=eng,deu \ + --build-arg VITE_OCR_FONT_BASE_URL=https://your-server.com/ocr/fonts \ -t bentopdf . # Or via .env.production before building from source @@ -195,6 +205,8 @@ VITE_WASM_PYMUPDF_URL=https://your-server.com/pymupdf/ npm run build To disable a module entirely (require manual user config via Advanced Settings), set its variable to an empty string. +For OCR, either leave all `VITE_TESSERACT_*` variables empty and keep the default online assets, or set the worker/core/lang URLs together for self-hosted/offline OCR. If you bundle only specific OCR languages, also set `VITE_TESSERACT_AVAILABLE_LANGUAGES` to the same comma-separated codes so the UI only offers installed languages and unsupported selections fail with a descriptive error. 
For fully offline searchable-PDF output, also set `VITE_OCR_FONT_BASE_URL` to the internal directory that serves the bundled OCR fonts. + Users can also override these defaults at any time via **Advanced Settings** in the UI — user overrides stored in the browser take priority over environment defaults. ### Air-Gapped / Offline Deployment @@ -209,6 +221,12 @@ The included `prepare-airgap.sh` script automates the entire process — downloa git clone https://github.com/alam00000/bentopdf.git cd bentopdf +# Show supported OCR language codes (for --ocr-languages) +bash scripts/prepare-airgap.sh --list-ocr-languages + +# Search OCR language codes by name or abbreviation +bash scripts/prepare-airgap.sh --search-ocr-language german + # Interactive mode — prompts for all options bash scripts/prepare-airgap.sh @@ -221,7 +239,9 @@ This produces a bundle directory: ``` bentopdf-airgap-bundle/ bentopdf.tar # Docker image - *.tgz # WASM packages (PyMuPDF, Ghostscript, CoherentPDF) + *.tgz # WASM packages (PyMuPDF, Ghostscript, CoherentPDF, Tesseract) + tesseract-langdata/ # OCR traineddata files + ocr-fonts/ # OCR text-layer font files setup.sh # Setup script for the air-gapped side README.md # Instructions ``` @@ -237,20 +257,25 @@ The setup script loads the Docker image, extracts WASM files, and optionally sta **Script options:** -| Flag | Description | Default | -| ----------------------- | ------------------------------------------------ | --------------------------------- | -| `--wasm-base-url ` | Where WASMs will be hosted internally | _(required, prompted if missing)_ | -| `--image-name ` | Docker image tag | `bentopdf` | -| `--output-dir ` | Output bundle directory | `./bentopdf-airgap-bundle` | -| `--simple-mode` | Enable Simple Mode | off | -| `--base-url ` | Subdirectory base URL (e.g. `/pdf/`) | `/` | -| `--language ` | Default UI language (e.g. 
`fr`, `de`) | _(none)_ | -| `--brand-name ` | Custom brand name | _(none)_ | -| `--brand-logo ` | Logo path relative to `public/` | _(none)_ | -| `--footer-text ` | Custom footer text | _(none)_ | -| `--dockerfile ` | Dockerfile to use | `Dockerfile` | -| `--skip-docker` | Skip Docker build and export | off | -| `--skip-wasm` | Skip WASM download (reuse existing `.tgz` files) | off | +| Flag | Description | Default | +| ------------------------------ | ------------------------------------------------ | --------------------------------- | +| `--wasm-base-url ` | Where WASMs will be hosted internally | _(required, prompted if missing)_ | +| `--image-name ` | Docker image tag | `bentopdf` | +| `--output-dir ` | Output bundle directory | `./bentopdf-airgap-bundle` | +| `--simple-mode` | Enable Simple Mode | off | +| `--base-url ` | Subdirectory base URL (e.g. `/pdf/`) | `/` | +| `--language ` | Default UI language (e.g. `fr`, `de`) | _(none)_ | +| `--brand-name ` | Custom brand name | _(none)_ | +| `--brand-logo ` | Logo path relative to `public/` | _(none)_ | +| `--footer-text ` | Custom footer text | _(none)_ | +| `--ocr-languages ` | Comma-separated OCR languages to bundle | `eng` | +| `--list-ocr-languages` | Print supported OCR codes and names, then exit | off | +| `--search-ocr-language ` | Search OCR codes by name or abbreviation | off | +| `--dockerfile ` | Dockerfile to use | `Dockerfile` | +| `--skip-docker` | Skip Docker build and export | off | +| `--skip-wasm` | Skip WASM download (reuse existing `.tgz` files) | off | + +The interactive prompt also accepts `list` to print the full supported Tesseract code list and `search ` to find matches such as `search german` or `search chi`. ::: warning Same-Origin Requirement WASM files must be served from the **same origin** as the BentoPDF app. Web Workers use `importScripts()` which cannot load scripts cross-origin. 
For example, if BentoPDF runs at `https://internal.example.com`, the WASM base URL should also be `https://internal.example.com/wasm`. @@ -261,12 +286,18 @@ WASM files must be served from the **same origin** as the BentoPDF app. Web Work
If you prefer to do it manually without the script -**Step 1: Download the WASM packages** (on a machine with internet) +**Step 1: Download the WASM and OCR packages** (on a machine with internet) ```bash npm pack @bentopdf/pymupdf-wasm@0.11.14 npm pack @bentopdf/gs-wasm npm pack coherentpdf +npm pack tesseract.js@7.0.0 +npm pack tesseract.js-core@7.0.0 +mkdir -p tesseract-langdata +curl -fsSL https://cdn.jsdelivr.net/npm/@tesseract.js-data/eng/4.0.0_best_int/eng.traineddata.gz -o tesseract-langdata/eng.traineddata.gz +mkdir -p ocr-fonts +curl -fsSL https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSans/NotoSans-Regular.ttf -o ocr-fonts/NotoSans-Regular.ttf ``` **Step 2: Build the Docker image with internal URLs** @@ -279,6 +310,10 @@ docker build \ --build-arg VITE_WASM_PYMUPDF_URL=https://internal-server.example.com/wasm/pymupdf/ \ --build-arg VITE_WASM_GS_URL=https://internal-server.example.com/wasm/gs/ \ --build-arg VITE_WASM_CPDF_URL=https://internal-server.example.com/wasm/cpdf/ \ + --build-arg VITE_TESSERACT_WORKER_URL=https://internal-server.example.com/wasm/ocr/worker.min.js \ + --build-arg VITE_TESSERACT_CORE_URL=https://internal-server.example.com/wasm/ocr/core \ + --build-arg VITE_TESSERACT_LANG_URL=https://internal-server.example.com/wasm/ocr/lang-data \ + --build-arg VITE_OCR_FONT_BASE_URL=https://internal-server.example.com/wasm/ocr/fonts \ -t bentopdf . 
``` @@ -293,7 +328,9 @@ docker save bentopdf -o bentopdf.tar Copy via USB, internal artifact repo, or approved transfer method: - `bentopdf.tar` — the Docker image -- The three `.tgz` WASM packages from Step 1 +- The five `.tgz` WASM/OCR packages from Step 1 +- The `tesseract-langdata/` directory from Step 1 +- The `ocr-fonts/` directory from Step 1 **Step 5: Set up inside the air-gapped network** @@ -302,16 +339,23 @@ Copy via USB, internal artifact repo, or approved transfer method: docker load -i bentopdf.tar # Extract WASM packages -mkdir -p ./wasm/pymupdf ./wasm/gs ./wasm/cpdf +mkdir -p ./wasm/pymupdf ./wasm/gs ./wasm/cpdf ./wasm/ocr/core ./wasm/ocr/lang-data ./wasm/ocr/fonts tar xzf bentopdf-pymupdf-wasm-0.11.14.tgz -C ./wasm/pymupdf --strip-components=1 tar xzf bentopdf-gs-wasm-*.tgz -C ./wasm/gs --strip-components=1 tar xzf coherentpdf-*.tgz -C ./wasm/cpdf --strip-components=1 +TEMP_TESS=$(mktemp -d) +tar xzf tesseract.js-7.0.0.tgz -C "$TEMP_TESS" +cp "$TEMP_TESS/package/dist/worker.min.js" ./wasm/ocr/worker.min.js +rm -rf "$TEMP_TESS" +tar xzf tesseract.js-core-7.0.0.tgz -C ./wasm/ocr/core --strip-components=1 +cp ./tesseract-langdata/*.traineddata.gz ./wasm/ocr/lang-data/ +cp ./ocr-fonts/* ./wasm/ocr/fonts/ # Run BentoPDF docker run -d -p 3000:8080 --restart unless-stopped bentopdf ``` -Make sure the WASM files are accessible at the URLs you configured in Step 2. +Make sure the files are accessible at the URLs you configured in Step 2, including `.../ocr/worker.min.js`, `.../ocr/core`, `.../ocr/lang-data`, and `.../ocr/fonts`.
@@ -322,6 +366,10 @@ Set the variables in `.env.production` before running `npm run build`: VITE_WASM_PYMUPDF_URL=https://internal-server.example.com/wasm/pymupdf/ VITE_WASM_GS_URL=https://internal-server.example.com/wasm/gs/ VITE_WASM_CPDF_URL=https://internal-server.example.com/wasm/cpdf/ +VITE_TESSERACT_WORKER_URL=https://internal-server.example.com/wasm/ocr/worker.min.js +VITE_TESSERACT_CORE_URL=https://internal-server.example.com/wasm/ocr/core +VITE_TESSERACT_LANG_URL=https://internal-server.example.com/wasm/ocr/lang-data +VITE_OCR_FONT_BASE_URL=https://internal-server.example.com/wasm/ocr/fonts ``` ::: diff --git a/scripts/prepare-airgap.sh b/scripts/prepare-airgap.sh index b87d54c..2bae28e 100755 --- a/scripts/prepare-airgap.sh +++ b/scripts/prepare-airgap.sh @@ -13,6 +13,8 @@ set -euo pipefail # Usage: # bash scripts/prepare-airgap.sh --wasm-base-url https://internal.example.com/wasm # bash scripts/prepare-airgap.sh # interactive mode +# bash scripts/prepare-airgap.sh --ocr-languages eng,deu,fra +# bash scripts/prepare-airgap.sh --search-ocr-language german # # See --help for all options. # ============================================================ @@ -54,6 +56,110 @@ DOCKERFILE="Dockerfile" SKIP_DOCKER=false SKIP_WASM=false INTERACTIVE=false +OCR_LANGUAGES="eng" +TESSDATA_VERSION="4.0.0_best_int" +LIST_OCR_LANGUAGES=false +SEARCH_OCR_LANGUAGE_TERM="" + +TESSERACT_LANGUAGE_CONFIG="src/js/config/tesseract-languages.ts" +FONT_MAPPING_CONFIG="src/js/config/font-mappings.ts" + +SUPPORTED_OCR_LANGUAGES_RAW="" +OCR_FONT_MANIFEST_RAW="" + +load_supported_ocr_languages() { + if [ -n "$SUPPORTED_OCR_LANGUAGES_RAW" ]; then + return + fi + + if [ ! 
-f "$TESSERACT_LANGUAGE_CONFIG" ]; then + error "Missing OCR language config: ${TESSERACT_LANGUAGE_CONFIG}" + exit 1 + fi + + SUPPORTED_OCR_LANGUAGES_RAW=$(node -e "const fs = require('fs'); const source = fs.readFileSync(process.argv[1], 'utf8'); const languages = []; const pattern = /^\\s*([a-z0-9_]+):\\s*'([^']+)'/gm; let match; while ((match = pattern.exec(source)) !== null) { languages.push(match[1] + '\\t' + match[2]); } process.stdout.write(languages.join('\\n'));" "$TESSERACT_LANGUAGE_CONFIG") + + if [ -z "$SUPPORTED_OCR_LANGUAGES_RAW" ]; then + error "Failed to load supported OCR languages from ${TESSERACT_LANGUAGE_CONFIG}" + exit 1 + fi +} + +is_supported_ocr_language() { + local code="$1" + load_supported_ocr_languages + printf '%s\n' "$SUPPORTED_OCR_LANGUAGES_RAW" | awk -F '\t' -v code="$code" '$1 == code { found = 1 } END { exit found ? 0 : 1 }' +} + +show_supported_ocr_languages() { + load_supported_ocr_languages + + echo "" + echo -e "${BOLD}Supported OCR languages:${NC}" + echo " Use the code in the left column for --ocr-languages." + echo "" + printf '%s\n' "$SUPPORTED_OCR_LANGUAGES_RAW" | awk -F '\t' '{ printf " %-12s %s\n", $1, $2 }' + echo "" + echo " Example: --ocr-languages eng,deu,fra,spa" + echo "" +} + +show_matching_ocr_languages() { + local query="$1" + load_supported_ocr_languages + + if [ -z "$query" ]; then + error "OCR language search requires a non-empty query." + exit 1 + fi + + local matches + matches=$(printf '%s\n' "$SUPPORTED_OCR_LANGUAGES_RAW" | awk -F '\t' -v query="$query" ' + BEGIN { + normalized = tolower(query) + } + { + code = tolower($1) + name = tolower($2) + if (index(code, normalized) || index(name, normalized)) { + printf "%s\t%s\n", $1, $2 + } + } + ') + + echo "" + echo -e "${BOLD}OCR language search:${NC} ${query}" + + if [ -z "$matches" ]; then + echo " No supported OCR languages matched that query." + echo " Tip: run --list-ocr-languages to browse the full list." 
+ echo "" + return 1 + fi + + echo " Matching codes for --ocr-languages:" + echo "" + printf '%s\n' "$matches" | awk -F '\t' '{ printf " %-12s %s\n", $1, $2 }' + echo "" +} + +load_required_ocr_fonts() { + if [ -n "$OCR_FONT_MANIFEST_RAW" ]; then + return + fi + + if [ ! -f "$FONT_MAPPING_CONFIG" ]; then + error "Missing OCR font mapping config: ${FONT_MAPPING_CONFIG}" + exit 1 + fi + + OCR_FONT_MANIFEST_RAW=$(node -e "const fs = require('fs'); const source = fs.readFileSync(process.argv[1], 'utf8'); const selected = (process.argv[2] || '').split(',').map((value) => value.trim()).filter(Boolean); const sections = source.split('export const fontFamilyToUrl'); const languageSection = sections[0] || ''; const fontSection = sections[1] || ''; const languageToFamily = {}; const fontFamilyToUrl = {}; let match; const languagePattern = /^\s*([a-z_]+):\s*'([^']+)',/gm; while ((match = languagePattern.exec(languageSection)) !== null) { languageToFamily[match[1]] = match[2]; } const fontPattern = /^\s*'([^']+)':\s*'([^']+)',/gm; while ((match = fontPattern.exec(fontSection)) !== null) { fontFamilyToUrl[match[1]] = match[2]; } const families = new Set(['Noto Sans']); for (const lang of selected) { families.add(languageToFamily[lang] || 'Noto Sans'); } const lines = Array.from(families).sort().map((family) => { const url = fontFamilyToUrl[family] || fontFamilyToUrl['Noto Sans']; const fileName = url.split('/').pop(); return [family, url, fileName].join('\t'); }); process.stdout.write(lines.join('\n'));" "$FONT_MAPPING_CONFIG" "$OCR_LANGUAGES") + + if [ -z "$OCR_FONT_MANIFEST_RAW" ]; then + error "Failed to resolve OCR font assets from ${FONT_MAPPING_CONFIG}" + exit 1 + fi +} # --- Usage --- usage() { @@ -80,6 +186,10 @@ OPTIONS: --brand-name Custom brand name --brand-logo Logo path relative to public/ --footer-text Custom footer text + --ocr-languages Comma-separated OCR languages to bundle + (default: eng) + --list-ocr-languages Print supported OCR language codes and exit + 
--search-ocr-language Search supported OCR languages by code or name --skip-docker Skip Docker build and export --skip-wasm Skip WASM download (reuse existing .tgz files) --help Show this help message @@ -91,6 +201,7 @@ EXAMPLES: # Full automation bash scripts/prepare-airgap.sh \ --wasm-base-url https://internal.example.com/wasm \ + --ocr-languages eng,deu,fra \ --brand-name "AcmePDF" \ --language fr @@ -98,6 +209,12 @@ EXAMPLES: bash scripts/prepare-airgap.sh \ --wasm-base-url https://internal.example.com/wasm \ --skip-docker + + # Show all supported OCR language codes + bash scripts/prepare-airgap.sh --list-ocr-languages + + # Search OCR languages by code or human-readable name + bash scripts/prepare-airgap.sh --search-ocr-language german EOF exit 0 } @@ -115,6 +232,9 @@ while [[ $# -gt 0 ]]; do --brand-name) BRAND_NAME="$2"; shift 2 ;; --brand-logo) BRAND_LOGO="$2"; shift 2 ;; --footer-text) FOOTER_TEXT="$2"; shift 2 ;; + --ocr-languages) OCR_LANGUAGES="$2"; shift 2 ;; + --list-ocr-languages) LIST_OCR_LANGUAGES=true; shift ;; + --search-ocr-language) SEARCH_OCR_LANGUAGE_TERM="$2"; shift 2 ;; --dockerfile) DOCKERFILE="$2"; shift 2 ;; --skip-docker) SKIP_DOCKER=true; shift ;; --skip-wasm) SKIP_WASM=true; shift ;; @@ -132,6 +252,18 @@ if [ ! -f "package.json" ] || [ ! -f "src/js/const/cdn-version.ts" ]; then exit 1 fi +if [ "$LIST_OCR_LANGUAGES" = true ]; then + show_supported_ocr_languages + exit 0 +fi + +if [ -n "$SEARCH_OCR_LANGUAGE_TERM" ]; then + if show_matching_ocr_languages "$SEARCH_OCR_LANGUAGE_TERM"; then + exit 0 + fi + exit 1 +fi + # --- Check prerequisites --- check_prerequisites() { local missing=false @@ -141,6 +273,11 @@ check_prerequisites() { missing=true fi + if [ "$SKIP_WASM" = false ] && ! command -v curl &>/dev/null; then + error "curl is required to download OCR language data." + missing=true + fi + if [ "$SKIP_DOCKER" = false ] && ! command -v docker &>/dev/null; then error "docker is required but not found (use --skip-docker to skip)." 
missing=true @@ -156,9 +293,11 @@ read_versions() { PYMUPDF_VERSION=$(grep "pymupdf:" src/js/const/cdn-version.ts | grep -o "'[^']*'" | tr -d "'") GS_VERSION=$(grep "ghostscript:" src/js/const/cdn-version.ts | grep -o "'[^']*'" | tr -d "'") APP_VERSION=$(node -p "require('./package.json').version") + TESSERACT_VERSION=$(node -p "require('./package-lock.json').packages['node_modules/tesseract.js'].version") + TESSERACT_CORE_VERSION=$(node -p "require('./package-lock.json').packages['node_modules/tesseract.js-core'].version") - if [ -z "$PYMUPDF_VERSION" ] || [ -z "$GS_VERSION" ]; then - error "Failed to read WASM versions from src/js/const/cdn-version.ts" + if [ -z "$PYMUPDF_VERSION" ] || [ -z "$GS_VERSION" ] || [ -z "$TESSERACT_VERSION" ] || [ -z "$TESSERACT_CORE_VERSION" ]; then + error "Failed to read external asset versions from the repository metadata" exit 1 fi } @@ -175,6 +314,8 @@ interactive_mode() { echo " PyMuPDF: ${PYMUPDF_VERSION}" echo " Ghostscript: ${GS_VERSION}" echo " CoherentPDF: latest" + echo " Tesseract.js: ${TESSERACT_VERSION}" + echo " OCR Data: ${TESSDATA_VERSION}" echo "" # [1] WASM base URL (REQUIRED) @@ -256,8 +397,35 @@ interactive_mode() { DOCKERFILE="${input:-$DOCKERFILE}" echo "" - # [8] Output directory (optional) - echo -e "${BOLD}[8/8] Output Directory ${GREEN}(optional)${NC}" + # [8] OCR languages (optional) + echo -e "${BOLD}[8/9] OCR Languages ${GREEN}(optional)${NC}" + echo " Comma-separated traineddata files to bundle for offline OCR." + echo " Enter Tesseract language codes such as: eng,deu,fra,spa" + echo " Type 'list' to print the full supported language list." + echo " Type 'search ' to find codes by name or abbreviation." + while true; do + read -r -p " OCR languages [${OCR_LANGUAGES}]: " input + if [ -z "${input:-}" ]; then + break + fi + if [ "$input" = "list" ]; then + show_supported_ocr_languages + continue + fi + if [[ "$input" == search\ * ]]; then + search_query="${input#search }" + if ! 
show_matching_ocr_languages "$search_query"; then + warn "No OCR language matched '${search_query}'." + fi + continue + fi + OCR_LANGUAGES="$input" + break + done + echo "" + + # [9] Output directory (optional) + echo -e "${BOLD}[9/9] Output Directory ${GREEN}(optional)${NC}" read -r -p " Path [${OUTPUT_DIR}]: " input OUTPUT_DIR="${input:-$OUTPUT_DIR}" @@ -274,6 +442,7 @@ interactive_mode() { [ -n "$BRAND_NAME" ] && echo " Brand Logo: ${BRAND_LOGO:-images/favicon-no-bg.svg (default)}" [ -n "$BRAND_NAME" ] && echo " Footer Text: ${FOOTER_TEXT:-(default)}" echo " Base URL: ${BASE_URL:-/ (root)}" + echo " OCR Languages: ${OCR_LANGUAGES}" echo " Output: ${OUTPUT_DIR}" echo "" read -r -p " Proceed? (Y/n): " input @@ -321,6 +490,7 @@ filesize() { check_prerequisites read_versions +load_supported_ocr_languages # If no WASM base URL provided, go interactive if [ -z "$WASM_BASE_URL" ]; then @@ -338,6 +508,34 @@ if [ -n "$LANGUAGE" ]; then fi fi +IFS=',' read -r -a OCR_LANGUAGE_ARRAY <<< "$OCR_LANGUAGES" +NORMALIZED_OCR_LANGUAGES=() +for raw_lang in "${OCR_LANGUAGE_ARRAY[@]}"; do + lang=$(echo "$raw_lang" | tr -d '[:space:]') + if [ -z "$lang" ]; then + continue + fi + if [[ ! "$lang" =~ ^[a-z0-9_]+$ ]]; then + error "Invalid OCR language code: ${lang}" + error "Use comma-separated Tesseract codes such as eng,deu,fra,chi_sim" + exit 1 + fi + if ! is_supported_ocr_language "$lang"; then + error "Unsupported OCR language code: ${lang}" + error "Run with --list-ocr-languages or --search-ocr-language to find supported Tesseract codes." + exit 1 + fi + NORMALIZED_OCR_LANGUAGES+=("$lang") +done + +if [ ${#NORMALIZED_OCR_LANGUAGES[@]} -eq 0 ]; then + error "At least one OCR language must be included." + exit 1 +fi + +OCR_LANGUAGES=$(IFS=','; echo "${NORMALIZED_OCR_LANGUAGES[*]}") +load_required_ocr_fonts + # Validate WASM base URL format if [[ ! 
"$WASM_BASE_URL" =~ ^https?:// ]]; then error "WASM base URL must start with http:// or https://" @@ -353,11 +551,15 @@ WASM_BASE_URL="${WASM_BASE_URL%/}" WASM_PYMUPDF_URL="${WASM_BASE_URL}/pymupdf/" WASM_GS_URL="${WASM_BASE_URL}/gs/" WASM_CPDF_URL="${WASM_BASE_URL}/cpdf/" +OCR_TESSERACT_WORKER_URL="${WASM_BASE_URL}/ocr/worker.min.js" +OCR_TESSERACT_CORE_URL="${WASM_BASE_URL}/ocr/core" +OCR_TESSERACT_LANG_URL="${WASM_BASE_URL}/ocr/lang-data" +OCR_FONT_BASE_URL="${WASM_BASE_URL}/ocr/fonts" echo "" echo -e "${BOLD}============================================================${NC}" echo -e "${BOLD} BentoPDF Air-Gapped Bundle Preparation${NC}" -echo -e "${BOLD} App: v${APP_VERSION} | PyMuPDF: ${PYMUPDF_VERSION} | GS: ${GS_VERSION}${NC}" +echo -e "${BOLD} App: v${APP_VERSION} | PyMuPDF: ${PYMUPDF_VERSION} | GS: ${GS_VERSION} | OCR: ${TESSERACT_VERSION}${NC}" echo -e "${BOLD}============================================================${NC}" # --- Phase 1: Prepare output directory --- @@ -398,6 +600,27 @@ if [ "$SKIP_WASM" = true ]; then error "Missing: coherentpdf-*.tgz" wasm_missing=true fi + if ! ls "$OUTPUT_DIR"/tesseract.js-[0-9]*.tgz &>/dev/null; then + error "Missing: tesseract.js-*.tgz" + wasm_missing=true + fi + if ! ls "$OUTPUT_DIR"/tesseract.js-core-*.tgz &>/dev/null; then + error "Missing: tesseract.js-core-*.tgz" + wasm_missing=true + fi + for lang in "${NORMALIZED_OCR_LANGUAGES[@]}"; do + if [ ! -f "$OUTPUT_DIR/tesseract-langdata/${lang}.traineddata.gz" ]; then + error "Missing: tesseract-langdata/${lang}.traineddata.gz" + wasm_missing=true + fi + done + while IFS=$'\t' read -r font_family font_url font_file; do + [ -z "$font_file" ] && continue + if [ ! -f "$OUTPUT_DIR/ocr-fonts/${font_file}" ]; then + error "Missing: ocr-fonts/${font_file} (${font_family})" + wasm_missing=true + fi + done <<< "$OCR_FONT_MANIFEST_RAW" if [ "$wasm_missing" = true ]; then error "Run without --skip-wasm first to download the packages." 
exit 1 @@ -430,8 +653,42 @@ else exit 1 fi + info "Downloading tesseract.js@${TESSERACT_VERSION}..." + if ! (cd "$WASM_TMP" && npm pack "tesseract.js@${TESSERACT_VERSION}" --quiet 2>&1); then + error "Failed to download tesseract.js@${TESSERACT_VERSION}" + exit 1 + fi + + info "Downloading tesseract.js-core@${TESSERACT_CORE_VERSION}..." + if ! (cd "$WASM_TMP" && npm pack "tesseract.js-core@${TESSERACT_CORE_VERSION}" --quiet 2>&1); then + error "Failed to download tesseract.js-core@${TESSERACT_CORE_VERSION}" + exit 1 + fi + # Move to output directory mv "$WASM_TMP"/*.tgz "$OUTPUT_DIR/" + + mkdir -p "$OUTPUT_DIR/tesseract-langdata" + for lang in "${NORMALIZED_OCR_LANGUAGES[@]}"; do + info "Downloading OCR language data: ${lang}..." + if ! curl -fsSL "https://cdn.jsdelivr.net/npm/@tesseract.js-data/${lang}/${TESSDATA_VERSION}/${lang}.traineddata.gz" -o "$OUTPUT_DIR/tesseract-langdata/${lang}.traineddata.gz"; then + error "Failed to download OCR language data for ${lang}" + error "Check that the language code exists and that the network can reach jsDelivr." + exit 1 + fi + done + + mkdir -p "$OUTPUT_DIR/ocr-fonts" + while IFS=$'\t' read -r font_family font_url font_file; do + [ -z "$font_file" ] && continue + info "Downloading OCR font: ${font_family}..." + if ! 
curl -fsSL "$font_url" -o "$OUTPUT_DIR/ocr-fonts/${font_file}"; then + error "Failed to download OCR font '${font_family}'" + error "Check that the network can reach the font URL: ${font_url}" + exit 1 + fi + done <<< "$OCR_FONT_MANIFEST_RAW" + rm -rf "$WASM_TMP" trap - EXIT @@ -443,6 +700,10 @@ else info " PyMuPDF: $(filesize "$OUTPUT_DIR"/bentopdf-pymupdf-wasm-*.tgz)" info " Ghostscript: $(filesize "$OUTPUT_DIR"/bentopdf-gs-wasm-*.tgz)" info " CoherentPDF: $(filesize "$CPDF_TGZ") (v${CPDF_VERSION})" + info " Tesseract.js: $(filesize "$OUTPUT_DIR"/tesseract.js-[0-9]*.tgz)" + info " OCR Core: $(filesize "$OUTPUT_DIR"/tesseract.js-core-*.tgz)" + info " OCR Langs: ${OCR_LANGUAGES}" + info " OCR Fonts: $(printf '%s\n' "$OCR_FONT_MANIFEST_RAW" | awk -F '\t' 'NF >= 1 { printf "%s%s", sep, $1; sep = ", " }')" fi # Resolve CPDF version if we skipped download @@ -488,6 +749,11 @@ else BUILD_ARGS+=(--build-arg "VITE_WASM_PYMUPDF_URL=${WASM_PYMUPDF_URL}") BUILD_ARGS+=(--build-arg "VITE_WASM_GS_URL=${WASM_GS_URL}") BUILD_ARGS+=(--build-arg "VITE_WASM_CPDF_URL=${WASM_CPDF_URL}") + BUILD_ARGS+=(--build-arg "VITE_TESSERACT_WORKER_URL=${OCR_TESSERACT_WORKER_URL}") + BUILD_ARGS+=(--build-arg "VITE_TESSERACT_CORE_URL=${OCR_TESSERACT_CORE_URL}") + BUILD_ARGS+=(--build-arg "VITE_TESSERACT_LANG_URL=${OCR_TESSERACT_LANG_URL}") + BUILD_ARGS+=(--build-arg "VITE_TESSERACT_AVAILABLE_LANGUAGES=${OCR_LANGUAGES}") + BUILD_ARGS+=(--build-arg "VITE_OCR_FONT_BASE_URL=${OCR_FONT_BASE_URL}") [ -n "$SIMPLE_MODE" ] && BUILD_ARGS+=(--build-arg "SIMPLE_MODE=${SIMPLE_MODE}") [ -n "$BASE_URL" ] && BUILD_ARGS+=(--build-arg "BASE_URL=${BASE_URL}") @@ -503,6 +769,12 @@ else info " PyMuPDF: ${WASM_PYMUPDF_URL}" info " Ghostscript: ${WASM_GS_URL}" info " CoherentPDF: ${WASM_CPDF_URL}" + info "OCR URLs:" + info " Worker: ${OCR_TESSERACT_WORKER_URL}" + info " Core: ${OCR_TESSERACT_CORE_URL}" + info " Lang Data: ${OCR_TESSERACT_LANG_URL}" + info " Font Base: ${OCR_FONT_BASE_URL}" + info " Languages: 
${OCR_LANGUAGES}" echo 
"" info "Building... this may take a few minutes (npm install + Vite build)." echo "" @@ -582,7 +854,7 @@ fi echo "" echo "[2/3] Extracting WASM packages to \${WASM_DIR}..." -mkdir -p "\${WASM_DIR}/pymupdf" "\${WASM_DIR}/gs" "\${WASM_DIR}/cpdf" +mkdir -p "\${WASM_DIR}/pymupdf" "\${WASM_DIR}/gs" "\${WASM_DIR}/cpdf" "\${WASM_DIR}/ocr/core" "\${WASM_DIR}/ocr/lang-data" "\${WASM_DIR}/ocr/fonts" # PyMuPDF: package has dist/ and assets/ at root echo " Extracting PyMuPDF..." @@ -610,12 +882,35 @@ else fi rm -rf "\${TEMP_CPDF}" +# Tesseract worker: browser expects a single worker.min.js file +echo " Extracting Tesseract worker..." +TEMP_TESS="\$(mktemp -d)" +tar xzf "\${SCRIPT_DIR}"/tesseract.js-[0-9]*.tgz -C "\${TEMP_TESS}" +cp "\${TEMP_TESS}/package/dist/worker.min.js" "\${WASM_DIR}/ocr/worker.min.js" +rm -rf "\${TEMP_TESS}" + +# Tesseract core: browser expects the full tesseract.js-core directory +echo " Extracting Tesseract core..." +tar xzf "\${SCRIPT_DIR}"/tesseract.js-core-*.tgz -C "\${WASM_DIR}/ocr/core" --strip-components=1 + +# OCR language data: copy the bundled traineddata files +echo " Installing OCR language data..." +cp "\${SCRIPT_DIR}"/tesseract-langdata/*.traineddata.gz "\${WASM_DIR}/ocr/lang-data/" + +# OCR fonts: copy the bundled font files for searchable text layer rendering +echo " Installing OCR fonts..." 
+cp "\${SCRIPT_DIR}"/ocr-fonts/* "\${WASM_DIR}/ocr/fonts/" + echo " WASM files extracted to: \${WASM_DIR}" echo "" echo " IMPORTANT: Ensure these paths are served by your internal web server:" echo " \${WASM_BASE_URL}/pymupdf/ -> \${WASM_DIR}/pymupdf/" echo " \${WASM_BASE_URL}/gs/ -> \${WASM_DIR}/gs/" echo " \${WASM_BASE_URL}/cpdf/ -> \${WASM_DIR}/cpdf/" +echo " \${WASM_BASE_URL}/ocr/worker.min.js -> \${WASM_DIR}/ocr/worker.min.js" +echo " \${WASM_BASE_URL}/ocr/core -> \${WASM_DIR}/ocr/core/" +echo " \${WASM_BASE_URL}/ocr/lang-data -> \${WASM_DIR}/ocr/lang-data/" +echo " \${WASM_BASE_URL}/ocr/fonts -> \${WASM_DIR}/ocr/fonts/" # --- Step 3: Start BentoPDF --- echo "" @@ -654,6 +949,10 @@ cat > "$OUTPUT_DIR/README.md" <= 3 { printf "- **%s** -> `%s`\n", $1, $3 }') These URLs are baked into the app at build time. The user's browser fetches WASM files from these URLs at runtime. @@ -694,7 +1003,7 @@ docker load -i bentopdf.tar Extract to your internal web server's document root: \`\`\`bash -mkdir -p ./wasm/pymupdf ./wasm/gs ./wasm/cpdf +mkdir -p ./wasm/pymupdf ./wasm/gs ./wasm/cpdf ./wasm/ocr/core ./wasm/ocr/lang-data ./wasm/ocr/fonts # PyMuPDF tar xzf bentopdf-pymupdf-wasm-${PYMUPDF_VERSION}.tgz -C ./wasm/pymupdf --strip-components=1 @@ -710,6 +1019,21 @@ TEMP_CPDF=\$(mktemp -d) tar xzf coherentpdf-${CPDF_VERSION}.tgz -C \$TEMP_CPDF cp -r \$TEMP_CPDF/package/dist/* ./wasm/cpdf/ rm -rf \$TEMP_CPDF + +# Tesseract worker +TEMP_TESS=\$(mktemp -d) +tar xzf tesseract.js-${TESSERACT_VERSION}.tgz -C \$TEMP_TESS +cp \$TEMP_TESS/package/dist/worker.min.js ./wasm/ocr/worker.min.js +rm -rf \$TEMP_TESS + +# Tesseract core +tar xzf tesseract.js-core-${TESSERACT_CORE_VERSION}.tgz -C ./wasm/ocr/core --strip-components=1 + +# OCR language data +cp ./tesseract-langdata/*.traineddata.gz ./wasm/ocr/lang-data/ + +# OCR fonts +cp ./ocr-fonts/* ./wasm/ocr/fonts/ \`\`\` ### 3. 
Configure your web server @@ -721,6 +1045,10 @@ Ensure these paths are accessible at the configured URLs: | \`${WASM_PYMUPDF_URL}\` | \`./wasm/pymupdf/\` | | \`${WASM_GS_URL}\` | \`./wasm/gs/\` | | \`${WASM_CPDF_URL}\` | \`./wasm/cpdf/\` | +| \`${OCR_TESSERACT_WORKER_URL}\` | \`./wasm/ocr/worker.min.js\` | +| \`${OCR_TESSERACT_CORE_URL}\` | \`./wasm/ocr/core/\` | +| \`${OCR_TESSERACT_LANG_URL}\` | \`./wasm/ocr/lang-data/\` | +| \`${OCR_FONT_BASE_URL}\` | \`./wasm/ocr/fonts/\` | ### 4. Run BentoPDF diff --git a/src/js/compare/engine/ocr-page.ts b/src/js/compare/engine/ocr-page.ts index 5c229c7..40abf00 100644 --- a/src/js/compare/engine/ocr-page.ts +++ b/src/js/compare/engine/ocr-page.ts @@ -1,37 +1,39 @@ -import Tesseract from 'tesseract.js'; - +import type Tesseract from 'tesseract.js'; import type { ComparePageModel, CompareTextItem } from '../types.ts'; import { mergeIntoLines, sortCompareTextItems } from './extract-page-model.ts'; import { joinCompareTextItems, normalizeCompareText, } from './text-normalization.ts'; +import { createConfiguredTesseractWorker } from '../../utils/tesseract-runtime.js'; -type OcrWord = { - text: string; - bbox: { - x0: number; - y0: number; - x1: number; - y1: number; - }; -}; +type OcrWord = Tesseract.Word; +type OcrRecognizeResult = Tesseract.RecognizeResult; +type OcrPageWithWords = Tesseract.Page & { words: OcrWord[] }; export async function recognizePageCanvas( canvas: HTMLCanvasElement, language: string, onProgress?: (status: string, progress: number) => void ): Promise { - const result = await Tesseract.recognize(canvas, language, { - logger(message) { + const worker = await createConfiguredTesseractWorker( + language, + 1, + (message) => { onProgress?.(message.status, message.progress || 0); - }, - }); + } + ); - const ocrData = result.data as unknown as { words?: OcrWord[] }; - const words = ((ocrData.words || []) as OcrWord[]) + let result: OcrRecognizeResult; + try { + result = await worker.recognize(canvas); + } finally 
{ + await worker.terminate(); + } + + const words = (result.data as OcrPageWithWords).words .map((word, index) => { - const normalizedText = normalizeCompareText(word.text || ''); + const normalizedText = normalizeCompareText(word.text); if (!normalizedText) return null; const item: CompareTextItem = { diff --git a/src/js/config/font-mappings.ts b/src/js/config/font-mappings.ts index c6c0c31..6a3df53 100644 --- a/src/js/config/font-mappings.ts +++ b/src/js/config/font-mappings.ts @@ -1,189 +1,233 @@ -/** - * Font mappings for OCR text layer rendering - * Maps Tesseract language codes to appropriate Noto Sans font families and their CDN URLs - */ - -export const languageToFontFamily: Record = { - // CJK Languages - jpn: 'Noto Sans JP', - chi_sim: 'Noto Sans SC', - chi_tra: 'Noto Sans TC', - kor: 'Noto Sans KR', - - // Arabic Script - ara: 'Noto Sans Arabic', - fas: 'Noto Sans Arabic', - urd: 'Noto Sans Arabic', - pus: 'Noto Sans Arabic', - kur: 'Noto Sans Arabic', - - // Devanagari Script - hin: 'Noto Sans Devanagari', - mar: 'Noto Sans Devanagari', - san: 'Noto Sans Devanagari', - nep: 'Noto Sans Devanagari', - - // Bengali Script - ben: 'Noto Sans Bengali', - asm: 'Noto Sans Bengali', - - // Tamil Script - tam: 'Noto Sans Tamil', - - // Telugu Script - tel: 'Noto Sans Telugu', - - // Kannada Script - kan: 'Noto Sans Kannada', - - // Malayalam Script - mal: 'Noto Sans Malayalam', - - // Gujarati Script - guj: 'Noto Sans Gujarati', - - // Gurmukhi Script (Punjabi) - pan: 'Noto Sans Gurmukhi', - - // Oriya Script - ori: 'Noto Sans Oriya', - - // Sinhala Script - sin: 'Noto Sans Sinhala', - - // Thai Script - tha: 'Noto Sans Thai', - - // Lao Script - lao: 'Noto Sans Lao', - - // Khmer Script - khm: 'Noto Sans Khmer', - - // Myanmar Script - mya: 'Noto Sans Myanmar', - - // Tibetan Script - bod: 'Noto Serif Tibetan', - - // Georgian Script - kat: 'Noto Sans Georgian', - kat_old: 'Noto Sans Georgian', - - // Armenian Script - hye: 'Noto Sans Armenian', - - // Hebrew 
Script - heb: 'Noto Sans Hebrew', - yid: 'Noto Sans Hebrew', - - // Ethiopic Script - amh: 'Noto Sans Ethiopic', - tir: 'Noto Sans Ethiopic', - - // Cherokee Script - chr: 'Noto Sans Cherokee', - - // Syriac Script - syr: 'Noto Sans Syriac', - - // Cyrillic Script (Noto Sans includes Cyrillic) - bel: 'Noto Sans', - bul: 'Noto Sans', - mkd: 'Noto Sans', - rus: 'Noto Sans', - srp: 'Noto Sans', - srp_latn: 'Noto Sans', - ukr: 'Noto Sans', - kaz: 'Noto Sans', - kir: 'Noto Sans', - tgk: 'Noto Sans', - uzb: 'Noto Sans', - uzb_cyrl: 'Noto Sans', - aze_cyrl: 'Noto Sans', - - // Latin Script (covered by base Noto Sans) - afr: 'Noto Sans', - aze: 'Noto Sans', - bos: 'Noto Sans', - cat: 'Noto Sans', - ceb: 'Noto Sans', - ces: 'Noto Sans', - cym: 'Noto Sans', - dan: 'Noto Sans', - deu: 'Noto Sans', - ell: 'Noto Sans', - eng: 'Noto Sans', - enm: 'Noto Sans', - epo: 'Noto Sans', - est: 'Noto Sans', - eus: 'Noto Sans', - fin: 'Noto Sans', - fra: 'Noto Sans', - frk: 'Noto Sans', - frm: 'Noto Sans', - gle: 'Noto Sans', - glg: 'Noto Sans', - grc: 'Noto Sans', - hat: 'Noto Sans', - hrv: 'Noto Sans', - hun: 'Noto Sans', - iku: 'Noto Sans', - ind: 'Noto Sans', - isl: 'Noto Sans', - ita: 'Noto Sans', - ita_old: 'Noto Sans', - jav: 'Noto Sans', - lat: 'Noto Sans', - lav: 'Noto Sans', - lit: 'Noto Sans', - mlt: 'Noto Sans', - msa: 'Noto Sans', - nld: 'Noto Sans', - nor: 'Noto Sans', - pol: 'Noto Sans', - por: 'Noto Sans', - ron: 'Noto Sans', - slk: 'Noto Sans', - slv: 'Noto Sans', - spa: 'Noto Sans', - spa_old: 'Noto Sans', - sqi: 'Noto Sans', - swa: 'Noto Sans', - swe: 'Noto Sans', - tgl: 'Noto Sans', - tur: 'Noto Sans', - vie: 'Noto Sans', - dzo: 'Noto Sans', - uig: 'Noto Sans', -}; - -export const fontFamilyToUrl: Record = { - 'Noto Sans JP': 'https://raw.githack.com/googlefonts/noto-cjk/main/Sans/OTF/Japanese/NotoSansCJKjp-Regular.otf', - 'Noto Sans SC': 'https://raw.githack.com/googlefonts/noto-cjk/main/Sans/OTF/SimplifiedChinese/NotoSansCJKsc-Regular.otf', - 'Noto Sans TC': 
'https://raw.githack.com/googlefonts/noto-cjk/main/Sans/OTF/TraditionalChinese/NotoSansCJKtc-Regular.otf', - 'Noto Sans KR': 'https://raw.githack.com/googlefonts/noto-cjk/main/Sans/OTF/Korean/NotoSansCJKkr-Regular.otf', - 'Noto Sans Arabic': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansArabic/NotoSansArabic-Regular.ttf', - 'Noto Sans Devanagari': 'https://raw.githack.com/googlefonts/noto-fonts/main/unhinted/ttf/NotoSansDevanagari/NotoSansDevanagari-Regular.ttf', - 'Noto Sans Bengali': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansBengali/NotoSansBengali-Regular.ttf', - 'Noto Sans Gujarati': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansGujarati/NotoSansGujarati-Regular.ttf', - 'Noto Sans Kannada': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansKannada/NotoSansKannada-Regular.ttf', - 'Noto Sans Malayalam': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansMalayalam/NotoSansMalayalam-Regular.ttf', - 'Noto Sans Oriya': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansOriya/NotoSansOriya-Regular.ttf', - 'Noto Sans Gurmukhi': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansGurmukhi/NotoSansGurmukhi-Regular.ttf', - 'Noto Sans Tamil': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansTamil/NotoSansTamil-Regular.ttf', - 'Noto Sans Telugu': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansTelugu/NotoSansTelugu-Regular.ttf', - 'Noto Sans Sinhala': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansSinhala/NotoSansSinhala-Regular.ttf', - 'Noto Sans Thai': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansThai/NotoSansThai-Regular.ttf', - 'Noto Sans Khmer': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansKhmer/NotoSansKhmer-Regular.ttf', - 'Noto Sans Lao': 
'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansLao/NotoSansLao-Regular.ttf', - 'Noto Sans Myanmar': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansMyanmar/NotoSansMyanmar-Regular.ttf', - 'Noto Sans Hebrew': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansHebrew/NotoSansHebrew-Regular.ttf', - 'Noto Sans Georgian': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansGeorgian/NotoSansGeorgian-Regular.ttf', - 'Noto Sans Ethiopic': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansEthiopic/NotoSansEthiopic-Regular.ttf', - 'Noto Serif Tibetan': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSerifTibetan/NotoSerifTibetan-Regular.ttf', - 'Noto Sans Cherokee': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansCherokee/NotoSansCherokee-Regular.ttf', - 'Noto Sans Armenian': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansArmenian/NotoSansArmenian-Regular.ttf', - 'Noto Sans Syriac': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansSyriac/NotoSansSyriac-Regular.ttf', - 'Noto Sans': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSans/NotoSans-Regular.ttf', -}; \ No newline at end of file +/** + * Font mappings for OCR text layer rendering + * Maps Tesseract language codes to appropriate Noto Sans font families and their CDN URLs + */ + +export const languageToFontFamily: Record = { + // CJK Languages + jpn: 'Noto Sans JP', + chi_sim: 'Noto Sans SC', + chi_tra: 'Noto Sans TC', + kor: 'Noto Sans KR', + + // Arabic Script + ara: 'Noto Sans Arabic', + fas: 'Noto Sans Arabic', + urd: 'Noto Sans Arabic', + pus: 'Noto Sans Arabic', + kur: 'Noto Sans Arabic', + + // Devanagari Script + hin: 'Noto Sans Devanagari', + mar: 'Noto Sans Devanagari', + san: 'Noto Sans Devanagari', + nep: 'Noto Sans Devanagari', + + // Bengali Script + ben: 'Noto Sans Bengali', + asm: 
'Noto Sans Bengali', + + // Tamil Script + tam: 'Noto Sans Tamil', + + // Telugu Script + tel: 'Noto Sans Telugu', + + // Kannada Script + kan: 'Noto Sans Kannada', + + // Malayalam Script + mal: 'Noto Sans Malayalam', + + // Gujarati Script + guj: 'Noto Sans Gujarati', + + // Gurmukhi Script (Punjabi) + pan: 'Noto Sans Gurmukhi', + + // Oriya Script + ori: 'Noto Sans Oriya', + + // Sinhala Script + sin: 'Noto Sans Sinhala', + + // Thai Script + tha: 'Noto Sans Thai', + + // Lao Script + lao: 'Noto Sans Lao', + + // Khmer Script + khm: 'Noto Sans Khmer', + + // Myanmar Script + mya: 'Noto Sans Myanmar', + + // Tibetan Script + bod: 'Noto Serif Tibetan', + + // Georgian Script + kat: 'Noto Sans Georgian', + kat_old: 'Noto Sans Georgian', + + // Armenian Script + hye: 'Noto Sans Armenian', + + // Hebrew Script + heb: 'Noto Sans Hebrew', + yid: 'Noto Sans Hebrew', + + // Ethiopic Script + amh: 'Noto Sans Ethiopic', + tir: 'Noto Sans Ethiopic', + + // Cherokee Script + chr: 'Noto Sans Cherokee', + + // Syriac Script + syr: 'Noto Sans Syriac', + + // Cyrillic Script (Noto Sans includes Cyrillic) + bel: 'Noto Sans', + bul: 'Noto Sans', + mkd: 'Noto Sans', + rus: 'Noto Sans', + srp: 'Noto Sans', + srp_latn: 'Noto Sans', + ukr: 'Noto Sans', + kaz: 'Noto Sans', + kir: 'Noto Sans', + tgk: 'Noto Sans', + uzb: 'Noto Sans', + uzb_cyrl: 'Noto Sans', + aze_cyrl: 'Noto Sans', + + // Latin Script (covered by base Noto Sans) + afr: 'Noto Sans', + aze: 'Noto Sans', + bos: 'Noto Sans', + cat: 'Noto Sans', + ceb: 'Noto Sans', + ces: 'Noto Sans', + cym: 'Noto Sans', + dan: 'Noto Sans', + deu: 'Noto Sans', + ell: 'Noto Sans', + eng: 'Noto Sans', + enm: 'Noto Sans', + epo: 'Noto Sans', + est: 'Noto Sans', + eus: 'Noto Sans', + fin: 'Noto Sans', + fra: 'Noto Sans', + frk: 'Noto Sans', + frm: 'Noto Sans', + gle: 'Noto Sans', + glg: 'Noto Sans', + grc: 'Noto Sans', + hat: 'Noto Sans', + hrv: 'Noto Sans', + hun: 'Noto Sans', + iku: 'Noto Sans', + ind: 'Noto Sans', + isl: 'Noto Sans', + ita: 
'Noto Sans', + ita_old: 'Noto Sans', + jav: 'Noto Sans', + lat: 'Noto Sans', + lav: 'Noto Sans', + lit: 'Noto Sans', + mlt: 'Noto Sans', + msa: 'Noto Sans', + nld: 'Noto Sans', + nor: 'Noto Sans', + pol: 'Noto Sans', + por: 'Noto Sans', + ron: 'Noto Sans', + slk: 'Noto Sans', + slv: 'Noto Sans', + spa: 'Noto Sans', + spa_old: 'Noto Sans', + sqi: 'Noto Sans', + swa: 'Noto Sans', + swe: 'Noto Sans', + tgl: 'Noto Sans', + tur: 'Noto Sans', + vie: 'Noto Sans', + dzo: 'Noto Sans', + uig: 'Noto Sans', +}; + +export const fontFamilyToUrl: Record = { + 'Noto Sans JP': + 'https://raw.githack.com/googlefonts/noto-cjk/main/Sans/OTF/Japanese/NotoSansCJKjp-Regular.otf', + 'Noto Sans SC': + 'https://raw.githack.com/googlefonts/noto-cjk/main/Sans/OTF/SimplifiedChinese/NotoSansCJKsc-Regular.otf', + 'Noto Sans TC': + 'https://raw.githack.com/googlefonts/noto-cjk/main/Sans/OTF/TraditionalChinese/NotoSansCJKtc-Regular.otf', + 'Noto Sans KR': + 'https://raw.githack.com/googlefonts/noto-cjk/main/Sans/OTF/Korean/NotoSansCJKkr-Regular.otf', + 'Noto Sans Arabic': + 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansArabic/NotoSansArabic-Regular.ttf', + 'Noto Sans Devanagari': + 'https://raw.githack.com/googlefonts/noto-fonts/main/unhinted/ttf/NotoSansDevanagari/NotoSansDevanagari-Regular.ttf', + 'Noto Sans Bengali': + 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansBengali/NotoSansBengali-Regular.ttf', + 'Noto Sans Gujarati': + 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansGujarati/NotoSansGujarati-Regular.ttf', + 'Noto Sans Kannada': + 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansKannada/NotoSansKannada-Regular.ttf', + 'Noto Sans Malayalam': + 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansMalayalam/NotoSansMalayalam-Regular.ttf', + 'Noto Sans Oriya': + 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansOriya/NotoSansOriya-Regular.ttf', + 'Noto 
Sans Gurmukhi': + 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansGurmukhi/NotoSansGurmukhi-Regular.ttf', + 'Noto Sans Tamil': + 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansTamil/NotoSansTamil-Regular.ttf', + 'Noto Sans Telugu': + 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansTelugu/NotoSansTelugu-Regular.ttf', + 'Noto Sans Sinhala': + 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansSinhala/NotoSansSinhala-Regular.ttf', + 'Noto Sans Thai': + 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansThai/NotoSansThai-Regular.ttf', + 'Noto Sans Khmer': + 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansKhmer/NotoSansKhmer-Regular.ttf', + 'Noto Sans Lao': + 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansLao/NotoSansLao-Regular.ttf', + 'Noto Sans Myanmar': + 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansMyanmar/NotoSansMyanmar-Regular.ttf', + 'Noto Sans Hebrew': + 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansHebrew/NotoSansHebrew-Regular.ttf', + 'Noto Sans Georgian': + 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansGeorgian/NotoSansGeorgian-Regular.ttf', + 'Noto Sans Ethiopic': + 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansEthiopic/NotoSansEthiopic-Regular.ttf', + 'Noto Serif Tibetan': + 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSerifTibetan/NotoSerifTibetan-Regular.ttf', + 'Noto Sans Cherokee': + 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansCherokee/NotoSansCherokee-Regular.ttf', + 'Noto Sans Armenian': + 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansArmenian/NotoSansArmenian-Regular.ttf', + 'Noto Sans Syriac': + 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansSyriac/NotoSansSyriac-Regular.ttf', + 
'Noto Sans': + 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSans/NotoSans-Regular.ttf', +}; + +export function getFontUrlForFamily(fontFamily: string): string { + return fontFamilyToUrl[fontFamily] || fontFamilyToUrl['Noto Sans']; +} + +export function getFontAssetFileName(fontFamily: string): string { + const defaultUrl = getFontUrlForFamily(fontFamily); + const fileName = defaultUrl.split('/').pop(); + + if (!fileName) { + throw new Error( + `Could not resolve a font asset filename for ${fontFamily}` + ); + } + + return fileName; +} diff --git a/src/js/logic/ocr-pdf-page.ts b/src/js/logic/ocr-pdf-page.ts index 04341f7..1f8318c 100644 --- a/src/js/logic/ocr-pdf-page.ts +++ b/src/js/logic/ocr-pdf-page.ts @@ -4,6 +4,11 @@ import { downloadFile, formatBytes } from '../utils/helpers.js'; import { icons, createIcons } from 'lucide'; import { OcrState } from '@/types'; import { performOcr } from '../utils/ocr.js'; +import { + getAvailableTesseractLanguageEntries, + resolveConfiguredTesseractAvailableLanguages, + UnsupportedOcrLanguageError, +} from '../utils/tesseract-language-availability.js'; const pageState: OcrState = { file: null, @@ -80,6 +85,30 @@ function resetState() { if (processBtn) processBtn.disabled = true; } +function updateLanguageAvailabilityNotice() { + const notice = document.getElementById('lang-availability-note'); + if (!notice) return; + + const configuredLanguages = resolveConfiguredTesseractAvailableLanguages(); + if (!configuredLanguages) { + notice.classList.add('hidden'); + notice.textContent = ''; + return; + } + + const availableEntries = getAvailableTesseractLanguageEntries(); + if (availableEntries.length === 0) { + notice.classList.remove('hidden'); + notice.textContent = + 'This deployment does not expose any valid OCR languages. 
Rebuild it with VITE_TESSERACT_AVAILABLE_LANGUAGES set to valid Tesseract codes.'; + return; + } + + const availableNames = availableEntries.map(([, name]) => name).join(', '); + notice.classList.remove('hidden'); + notice.textContent = `This deployment bundles OCR for: ${availableNames}.`; +} + async function runOCR() { const selectedLangs = Array.from( document.querySelectorAll('.lang-checkbox:checked') @@ -142,10 +171,14 @@ async function runOCR() { if (textOutput) textOutput.value = result.fullText.trim(); } catch (e) { console.error(e); - showAlert( - 'OCR Error', - 'An error occurred during the OCR process. The worker may have failed to load. Please try again.' - ); + if (e instanceof UnsupportedOcrLanguageError) { + showAlert('OCR Language Not Available', e.message); + } else { + showAlert( + 'OCR Error', + 'An error occurred during the OCR process. The worker may have failed to load. Please try again.' + ); + } if (toolOptions) toolOptions.classList.remove('hidden'); if (ocrProgress) ocrProgress.classList.add('hidden'); } @@ -213,10 +246,21 @@ function populateLanguageList() { langList.innerHTML = ''; - Object.entries(tesseractLanguages).forEach(function ([code, name]) { + const availableEntries = getAvailableTesseractLanguageEntries(); + if (availableEntries.length === 0) { + const emptyState = document.createElement('p'); + emptyState.className = 'text-sm text-yellow-300 p-2'; + emptyState.textContent = + 'No OCR languages are available in this deployment.'; + langList.appendChild(emptyState); + return; + } + + availableEntries.forEach(function ([code, name]) { const label = document.createElement('label'); label.className = 'flex items-center gap-2 p-2 rounded-md hover:bg-gray-700 cursor-pointer'; + label.dataset.search = `${name} ${code}`.toLowerCase(); const checkbox = document.createElement('input'); checkbox.type = 'checkbox'; @@ -253,6 +297,7 @@ document.addEventListener('DOMContentLoaded', function () { const downloadPdfBtn = 
document.getElementById('download-searchable-pdf'); populateLanguageList(); + updateLanguageAvailabilityNotice(); if (backBtn) { backBtn.addEventListener('click', function () { @@ -304,9 +349,9 @@ document.addEventListener('DOMContentLoaded', function () { langSearch.addEventListener('input', function () { const searchTerm = langSearch.value.toLowerCase(); langList.querySelectorAll('label').forEach(function (label) { - (label as HTMLElement).style.display = label.textContent - ?.toLowerCase() - .includes(searchTerm) + (label as HTMLElement).style.display = ( + label as HTMLElement + ).dataset.search?.includes(searchTerm) ? '' : 'none'; }); diff --git a/src/js/logic/pdf-workflow-page.ts b/src/js/logic/pdf-workflow-page.ts index c4ab6bb..7bac637 100644 --- a/src/js/logic/pdf-workflow-page.ts +++ b/src/js/logic/pdf-workflow-page.ts @@ -1,7 +1,7 @@ import { showAlert } from '../ui.js'; -import { tesseractLanguages } from '../config/tesseract-languages.js'; import { createWorkflowEditor, updateNodeDisplay } from '../workflow/editor'; import { executeWorkflow } from '../workflow/engine'; +import { getAvailableTesseractLanguageEntries } from '../utils/tesseract-language-availability.js'; import { nodeRegistry, getNodesByCategory, @@ -1194,7 +1194,7 @@ function showNodeSettings(node: BaseWorkflowNode) { { label: 'High (288 DPI)', value: '3.0' }, { label: 'Ultra (384 DPI)', value: '4.0' }, ], - language: Object.entries(tesseractLanguages).map(([code, name]) => ({ + language: getAvailableTesseractLanguageEntries().map(([code, name]) => ({ label: name, value: code, })), diff --git a/src/js/utils/font-loader.ts b/src/js/utils/font-loader.ts index 7d2bc83..b27c27e 100644 --- a/src/js/utils/font-loader.ts +++ b/src/js/utils/font-loader.ts @@ -1,281 +1,330 @@ -import { languageToFontFamily, fontFamilyToUrl } from '../config/font-mappings.js'; - -const fontCache: Map = new Map(); - -const DB_NAME = 'bentopdf-fonts'; -const DB_VERSION = 1; -const STORE_NAME = 'fonts'; - -async 
function openFontDB(): Promise { - return new Promise((resolve, reject) => { - const request = indexedDB.open(DB_NAME, DB_VERSION); - - request.onerror = () => reject(request.error); - request.onsuccess = () => resolve(request.result); - - request.onupgradeneeded = (event) => { - const db = (event.target as IDBOpenDBRequest).result; - if (!db.objectStoreNames.contains(STORE_NAME)) { - db.createObjectStore(STORE_NAME); - } - }; - }); -} - -async function getCachedFontFromDB(fontFamily: string): Promise { - try { - const db = await openFontDB(); - return new Promise((resolve, reject) => { - const transaction = db.transaction(STORE_NAME, 'readonly'); - const store = transaction.objectStore(STORE_NAME); - const request = store.get(fontFamily); - - request.onsuccess = () => resolve(request.result || null); - request.onerror = () => reject(request.error); - }); - } catch (error) { - console.warn('IndexedDB read failed:', error); - return null; - } -} - -async function saveFontToDB(fontFamily: string, fontBuffer: ArrayBuffer): Promise { - try { - const db = await openFontDB(); - return new Promise((resolve, reject) => { - const transaction = db.transaction(STORE_NAME, 'readwrite'); - const store = transaction.objectStore(STORE_NAME); - const request = store.put(fontBuffer, fontFamily); - - request.onsuccess = () => resolve(); - request.onerror = () => reject(request.error); - }); - } catch (error) { - console.warn('IndexedDB write failed:', error); - } -} - -export async function getFontForLanguage(lang: string): Promise { - const fontFamily = languageToFontFamily[lang] || 'Noto Sans'; - - if (fontCache.has(fontFamily)) { - return fontCache.get(fontFamily)!; - } - const cachedFont = await getCachedFontFromDB(fontFamily); - if (cachedFont) { - fontCache.set(fontFamily, cachedFont); - return cachedFont; - } - - try { - const fontUrl = fontFamilyToUrl[fontFamily] || fontFamilyToUrl['Noto Sans']; - - const fontResponse = await fetch(fontUrl); - - if (!fontResponse.ok) { - 
throw new Error(`Failed to fetch font file: ${fontResponse.statusText}`); - } - - const fontBuffer = await fontResponse.arrayBuffer(); - - fontCache.set(fontFamily, fontBuffer); - await saveFontToDB(fontFamily, fontBuffer); - - return fontBuffer; - } catch (error) { - console.warn(`Failed to fetch font for ${lang} (${fontFamily}), falling back to default.`, error); - - if (fontFamily !== 'Noto Sans') { - return await getFontForLanguage('eng'); - } - - throw error; - } -} - -export function detectScripts(text: string): string[] { - const scripts = new Set(); - - // Japanese: Hiragana (\u3040-\u309F) & Katakana (\u30A0-\u30FF) - if (/[\u3040-\u309F\u30A0-\u30FF]/.test(text)) { - scripts.add('jpn'); - } - - // Korean: Hangul Syllables (\uAC00-\uD7A3) & Jamo (\u1100-\u11FF) - if (/[\uAC00-\uD7A3\u1100-\u11FF]/.test(text)) { - scripts.add('kor'); - } - - // Chinese: CJK Unified Ideographs (\u4E00-\u9FFF) & Ext A (\u3400-\u4DBF) - if (/[\u4E00-\u9FFF\u3400-\u4DBF]/.test(text)) { - scripts.add('chi_sim'); - } - - // Check for Arabic - if (/[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF]/.test(text)) { - scripts.add('ara'); - } - - // Check for Devanagari (Hindi, Marathi, etc.) 
- if (/[\u0900-\u097F]/.test(text)) scripts.add('hin'); - - // Check for Bengali - if (/[\u0980-\u09FF]/.test(text)) scripts.add('ben'); - - // Check for Tamil - if (/[\u0B80-\u0BFF]/.test(text)) scripts.add('tam'); - - // Check for Telugu - if (/[\u0C00-\u0C7F]/.test(text)) scripts.add('tel'); - - // Check for Kannada - if (/[\u0C80-\u0CFF]/.test(text)) scripts.add('kan'); - - // Check for Malayalam - if (/[\u0D00-\u0D7F]/.test(text)) scripts.add('mal'); - - // Check for Gujarati - if (/[\u0A80-\u0AFF]/.test(text)) scripts.add('guj'); - - // Check for Punjabi (Gurmukhi) - if (/[\u0A00-\u0A7F]/.test(text)) scripts.add('pan'); - - // Check for Oriya - if (/[\u0B00-\u0B7F]/.test(text)) scripts.add('ori'); - - // Check for Sinhala - if (/[\u0D80-\u0DFF]/.test(text)) scripts.add('sin'); - - // Check for Thai - if (/[\u0E00-\u0E7F]/.test(text)) scripts.add('tha'); - - // Check for Lao - if (/[\u0E80-\u0EFF]/.test(text)) scripts.add('lao'); - - // Check for Khmer - if (/[\u1780-\u17FF]/.test(text)) scripts.add('khm'); - - // Check for Myanmar - if (/[\u1000-\u109F]/.test(text)) scripts.add('mya'); - - // Check for Tibetan - if (/[\u0F00-\u0FFF]/.test(text)) scripts.add('bod'); - - // Check for Georgian - if (/[\u10A0-\u10FF]/.test(text)) scripts.add('kat'); - - // Check for Armenian - if (/[\u0530-\u058F]/.test(text)) scripts.add('hye'); - - // Check for Hebrew - if (/[\u0590-\u05FF]/.test(text)) scripts.add('heb'); - - // Check for Ethiopic - if (/[\u1200-\u137F]/.test(text)) scripts.add('amh'); - - // Check for Cherokee - if (/[\u13A0-\u13FF]/.test(text)) scripts.add('chr'); - - // Check for Syriac - if (/[\u0700-\u074F]/.test(text)) scripts.add('syr'); - - if (scripts.size === 0 || /[a-zA-Z]/.test(text)) { - scripts.add('eng'); - } - - return Array.from(scripts); -} - -export function getLanguageForChar(char: string): string { - const code = char.charCodeAt(0); - - // Latin (Basic + Supplement + Extended) - if (code <= 0x024F) return 'eng'; - - // Japanese: Hiragana & 
Katakana - if ( - (code >= 0x3040 && code <= 0x309F) || // Hiragana - (code >= 0x30A0 && code <= 0x30FF) // Katakana - ) return 'jpn'; - - // Korean: Hangul Syllables & Jamo - if ( - (code >= 0xAC00 && code <= 0xD7A3) || // Hangul Syllables - (code >= 0x1100 && code <= 0x11FF) // Hangul Jamo - ) return 'kor'; - - // Chinese: CJK Unified Ideographs (Han) - if ( - (code >= 0x4E00 && code <= 0x9FFF) || // CJK Unified - (code >= 0x3400 && code <= 0x4DBF) // CJK Ext A - ) return 'chi_sim'; - - // Arabic - if ((code >= 0x0600 && code <= 0x06FF) || (code >= 0x0750 && code <= 0x077F) || (code >= 0x08A0 && code <= 0x08FF)) return 'ara'; - - // Devanagari - if (code >= 0x0900 && code <= 0x097F) return 'hin'; - - // Bengali - if (code >= 0x0980 && code <= 0x09FF) return 'ben'; - - // Tamil - if (code >= 0x0B80 && code <= 0x0BFF) return 'tam'; - - // Telugu - if (code >= 0x0C00 && code <= 0x0C7F) return 'tel'; - - // Kannada - if (code >= 0x0C80 && code <= 0x0CFF) return 'kan'; - - // Malayalam - if (code >= 0x0D00 && code <= 0x0D7F) return 'mal'; - - // Gujarati - if (code >= 0x0A80 && code <= 0x0AFF) return 'guj'; - - // Punjabi (Gurmukhi) - if (code >= 0x0A00 && code <= 0x0A7F) return 'pan'; - - // Oriya - if (code >= 0x0B00 && code <= 0x0B7F) return 'ori'; - - // Sinhala - if (code >= 0x0D80 && code <= 0x0DFF) return 'sin'; - - // Thai - if (code >= 0x0E00 && code <= 0x0E7F) return 'tha'; - - // Lao - if (code >= 0x0E80 && code <= 0x0EFF) return 'lao'; - - // Khmer - if (code >= 0x1780 && code <= 0x17FF) return 'khm'; - - // Myanmar - if (code >= 0x1000 && code <= 0x109F) return 'mya'; - - // Tibetan - if (code >= 0x0F00 && code <= 0x0FFF) return 'bod'; - - // Georgian - if (code >= 0x10A0 && code <= 0x10FF) return 'kat'; - - // Armenian - if (code >= 0x0530 && code <= 0x058F) return 'hye'; - - // Hebrew - if (code >= 0x0590 && code <= 0x05FF) return 'heb'; - - // Ethiopic - if (code >= 0x1200 && code <= 0x137F) return 'amh'; - - // Cherokee - if (code >= 0x13A0 && code <= 
0x13FF) return 'chr'; - - // Syriac - if (code >= 0x0700 && code <= 0x074F) return 'syr'; - - // Default to English (Latin) - return 'eng'; -} +import { + getFontAssetFileName, + getFontUrlForFamily, + languageToFontFamily, +} from '../config/font-mappings.js'; + +const fontCache: Map = new Map(); + +const DB_NAME = 'bentopdf-fonts'; +const DB_VERSION = 1; +const STORE_NAME = 'fonts'; + +type OcrFontEnv = Partial>; + +function getDefaultFontEnv(): OcrFontEnv { + return import.meta.env; +} + +function normalizeFontBaseUrl(url?: string): string | undefined { + const trimmed = url?.trim(); + + if (!trimmed) { + return undefined; + } + + return trimmed.replace(/\/+$/, ''); +} + +export function resolveFontUrl( + fontFamily: string, + env: OcrFontEnv = getDefaultFontEnv() +): string { + const fontBaseUrl = normalizeFontBaseUrl(env.VITE_OCR_FONT_BASE_URL); + + if (fontBaseUrl) { + return `${fontBaseUrl}/${getFontAssetFileName(fontFamily)}`; + } + + return getFontUrlForFamily(fontFamily); +} + +async function openFontDB(): Promise { + return new Promise((resolve, reject) => { + const request = indexedDB.open(DB_NAME, DB_VERSION); + + request.onerror = () => reject(request.error); + request.onsuccess = () => resolve(request.result); + + request.onupgradeneeded = (event) => { + const db = (event.target as IDBOpenDBRequest).result; + if (!db.objectStoreNames.contains(STORE_NAME)) { + db.createObjectStore(STORE_NAME); + } + }; + }); +} + +async function getCachedFontFromDB( + fontFamily: string +): Promise { + try { + const db = await openFontDB(); + return new Promise((resolve, reject) => { + const transaction = db.transaction(STORE_NAME, 'readonly'); + const store = transaction.objectStore(STORE_NAME); + const request = store.get(fontFamily); + + request.onsuccess = () => resolve(request.result || null); + request.onerror = () => reject(request.error); + }); + } catch (error) { + console.warn('IndexedDB read failed:', error); + return null; + } +} + +async function 
saveFontToDB( + fontFamily: string, + fontBuffer: ArrayBuffer +): Promise { + try { + const db = await openFontDB(); + return new Promise((resolve, reject) => { + const transaction = db.transaction(STORE_NAME, 'readwrite'); + const store = transaction.objectStore(STORE_NAME); + const request = store.put(fontBuffer, fontFamily); + + request.onsuccess = () => resolve(); + request.onerror = () => reject(request.error); + }); + } catch (error) { + console.warn('IndexedDB write failed:', error); + } +} + +export async function getFontForLanguage(lang: string): Promise { + const fontFamily = languageToFontFamily[lang] || 'Noto Sans'; + + if (fontCache.has(fontFamily)) { + return fontCache.get(fontFamily)!; + } + const cachedFont = await getCachedFontFromDB(fontFamily); + if (cachedFont) { + fontCache.set(fontFamily, cachedFont); + return cachedFont; + } + + try { + const fontUrl = resolveFontUrl(fontFamily); + + const fontResponse = await fetch(fontUrl); + + if (!fontResponse.ok) { + throw new Error(`Failed to fetch font file: ${fontResponse.statusText}`); + } + + const fontBuffer = await fontResponse.arrayBuffer(); + + fontCache.set(fontFamily, fontBuffer); + await saveFontToDB(fontFamily, fontBuffer); + + return fontBuffer; + } catch (error) { + console.warn( + `Failed to fetch font for ${lang} (${fontFamily}), falling back to default.`, + error + ); + + if (fontFamily !== 'Noto Sans') { + return await getFontForLanguage('eng'); + } + + throw error; + } +} + +export function detectScripts(text: string): string[] { + const scripts = new Set(); + + // Japanese: Hiragana (\u3040-\u309F) & Katakana (\u30A0-\u30FF) + if (/[\u3040-\u309F\u30A0-\u30FF]/.test(text)) { + scripts.add('jpn'); + } + + // Korean: Hangul Syllables (\uAC00-\uD7A3) & Jamo (\u1100-\u11FF) + if (/[\uAC00-\uD7A3\u1100-\u11FF]/.test(text)) { + scripts.add('kor'); + } + + // Chinese: CJK Unified Ideographs (\u4E00-\u9FFF) & Ext A (\u3400-\u4DBF) + if (/[\u4E00-\u9FFF\u3400-\u4DBF]/.test(text)) { + 
scripts.add('chi_sim'); + } + + // Check for Arabic + if (/[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF]/.test(text)) { + scripts.add('ara'); + } + + // Check for Devanagari (Hindi, Marathi, etc.) + if (/[\u0900-\u097F]/.test(text)) scripts.add('hin'); + + // Check for Bengali + if (/[\u0980-\u09FF]/.test(text)) scripts.add('ben'); + + // Check for Tamil + if (/[\u0B80-\u0BFF]/.test(text)) scripts.add('tam'); + + // Check for Telugu + if (/[\u0C00-\u0C7F]/.test(text)) scripts.add('tel'); + + // Check for Kannada + if (/[\u0C80-\u0CFF]/.test(text)) scripts.add('kan'); + + // Check for Malayalam + if (/[\u0D00-\u0D7F]/.test(text)) scripts.add('mal'); + + // Check for Gujarati + if (/[\u0A80-\u0AFF]/.test(text)) scripts.add('guj'); + + // Check for Punjabi (Gurmukhi) + if (/[\u0A00-\u0A7F]/.test(text)) scripts.add('pan'); + + // Check for Oriya + if (/[\u0B00-\u0B7F]/.test(text)) scripts.add('ori'); + + // Check for Sinhala + if (/[\u0D80-\u0DFF]/.test(text)) scripts.add('sin'); + + // Check for Thai + if (/[\u0E00-\u0E7F]/.test(text)) scripts.add('tha'); + + // Check for Lao + if (/[\u0E80-\u0EFF]/.test(text)) scripts.add('lao'); + + // Check for Khmer + if (/[\u1780-\u17FF]/.test(text)) scripts.add('khm'); + + // Check for Myanmar + if (/[\u1000-\u109F]/.test(text)) scripts.add('mya'); + + // Check for Tibetan + if (/[\u0F00-\u0FFF]/.test(text)) scripts.add('bod'); + + // Check for Georgian + if (/[\u10A0-\u10FF]/.test(text)) scripts.add('kat'); + + // Check for Armenian + if (/[\u0530-\u058F]/.test(text)) scripts.add('hye'); + + // Check for Hebrew + if (/[\u0590-\u05FF]/.test(text)) scripts.add('heb'); + + // Check for Ethiopic + if (/[\u1200-\u137F]/.test(text)) scripts.add('amh'); + + // Check for Cherokee + if (/[\u13A0-\u13FF]/.test(text)) scripts.add('chr'); + + // Check for Syriac + if (/[\u0700-\u074F]/.test(text)) scripts.add('syr'); + + if (scripts.size === 0 || /[a-zA-Z]/.test(text)) { + scripts.add('eng'); + } + + return Array.from(scripts); +} + +export 
function getLanguageForChar(char: string): string { + const code = char.charCodeAt(0); + + // Latin (Basic + Supplement + Extended) + if (code <= 0x024f) return 'eng'; + + // Japanese: Hiragana & Katakana + if ( + (code >= 0x3040 && code <= 0x309f) || // Hiragana + (code >= 0x30a0 && code <= 0x30ff) // Katakana + ) + return 'jpn'; + + // Korean: Hangul Syllables & Jamo + if ( + (code >= 0xac00 && code <= 0xd7a3) || // Hangul Syllables + (code >= 0x1100 && code <= 0x11ff) // Hangul Jamo + ) + return 'kor'; + + // Chinese: CJK Unified Ideographs (Han) + if ( + (code >= 0x4e00 && code <= 0x9fff) || // CJK Unified + (code >= 0x3400 && code <= 0x4dbf) // CJK Ext A + ) + return 'chi_sim'; + + // Arabic + if ( + (code >= 0x0600 && code <= 0x06ff) || + (code >= 0x0750 && code <= 0x077f) || + (code >= 0x08a0 && code <= 0x08ff) + ) + return 'ara'; + + // Devanagari + if (code >= 0x0900 && code <= 0x097f) return 'hin'; + + // Bengali + if (code >= 0x0980 && code <= 0x09ff) return 'ben'; + + // Tamil + if (code >= 0x0b80 && code <= 0x0bff) return 'tam'; + + // Telugu + if (code >= 0x0c00 && code <= 0x0c7f) return 'tel'; + + // Kannada + if (code >= 0x0c80 && code <= 0x0cff) return 'kan'; + + // Malayalam + if (code >= 0x0d00 && code <= 0x0d7f) return 'mal'; + + // Gujarati + if (code >= 0x0a80 && code <= 0x0aff) return 'guj'; + + // Punjabi (Gurmukhi) + if (code >= 0x0a00 && code <= 0x0a7f) return 'pan'; + + // Oriya + if (code >= 0x0b00 && code <= 0x0b7f) return 'ori'; + + // Sinhala + if (code >= 0x0d80 && code <= 0x0dff) return 'sin'; + + // Thai + if (code >= 0x0e00 && code <= 0x0e7f) return 'tha'; + + // Lao + if (code >= 0x0e80 && code <= 0x0eff) return 'lao'; + + // Khmer + if (code >= 0x1780 && code <= 0x17ff) return 'khm'; + + // Myanmar + if (code >= 0x1000 && code <= 0x109f) return 'mya'; + + // Tibetan + if (code >= 0x0f00 && code <= 0x0fff) return 'bod'; + + // Georgian + if (code >= 0x10a0 && code <= 0x10ff) return 'kat'; + + // Armenian + if (code >= 0x0530 && 
code <= 0x058f) return 'hye'; + + // Hebrew + if (code >= 0x0590 && code <= 0x05ff) return 'heb'; + + // Ethiopic + if (code >= 0x1200 && code <= 0x137f) return 'amh'; + + // Cherokee + if (code >= 0x13a0 && code <= 0x13ff) return 'chr'; + + // Syriac + if (code >= 0x0700 && code <= 0x074f) return 'syr'; + + // Default to English (Latin) + return 'eng'; +} diff --git a/src/js/utils/ocr.ts b/src/js/utils/ocr.ts index 5a38d39..931d3c1 100644 --- a/src/js/utils/ocr.ts +++ b/src/js/utils/ocr.ts @@ -1,7 +1,6 @@ import Tesseract from 'tesseract.js'; import { PDFDocument, StandardFonts, rgb, PDFFont } from 'pdf-lib'; import fontkit from '@pdf-lib/fontkit'; -import * as pdfjsLib from 'pdfjs-dist'; import { getFontForLanguage } from './font-loader.js'; import { OcrPage, OcrLine } from '@/types'; import { @@ -10,6 +9,7 @@ import { calculateSpaceTransform, } from './hocr-transform.js'; import { getPDFDocument } from './helpers.js'; +import { createConfiguredTesseractWorker } from './tesseract-runtime.js'; export interface OcrOptions { language: string; @@ -134,11 +134,13 @@ export async function performOcr( const { language, resolution, binarize, whitelist, onProgress } = options; const progress = onProgress || (() => {}); - const worker = await Tesseract.createWorker(language, 1, { - logger: function (m: { status: string; progress: number }) { + const worker = await createConfiguredTesseractWorker( + language, + 1, + function (m: { status: string; progress: number }) { progress(m.status, m.progress || 0); - }, - }); + } + ); await worker.setParameters({ tessjs_create_hocr: '1', diff --git a/src/js/utils/tesseract-language-availability.ts b/src/js/utils/tesseract-language-availability.ts new file mode 100644 index 0000000..16cff30 --- /dev/null +++ b/src/js/utils/tesseract-language-availability.ts @@ -0,0 +1,132 @@ +import { tesseractLanguages } from '../config/tesseract-languages.js'; + +export const TESSERACT_AVAILABLE_LANGUAGES_ENV_KEY = + 
'VITE_TESSERACT_AVAILABLE_LANGUAGES' as const; + +type TesseractAvailabilityEnv = Partial< + Pick +>; + +export type TesseractLanguageCode = keyof typeof tesseractLanguages; + +function getDefaultEnv(): TesseractAvailabilityEnv { + return import.meta.env; +} + +function normalizeLanguageCodes(value: string | string[]): string[] { + const rawCodes = Array.isArray(value) ? value : value.split(/[+,]/); + const seen = new Set(); + const normalizedCodes: string[] = []; + + for (const rawCode of rawCodes) { + const code = rawCode.trim(); + if (!code || seen.has(code)) { + continue; + } + seen.add(code); + normalizedCodes.push(code); + } + + return normalizedCodes; +} + +function formatLanguageLabel(code: string): string { + const label = tesseractLanguages[code as TesseractLanguageCode]; + return label ? `${label} (${code})` : code; +} + +export function resolveConfiguredTesseractAvailableLanguages( + env: TesseractAvailabilityEnv = getDefaultEnv() +): string[] | null { + const configuredLanguages = env.VITE_TESSERACT_AVAILABLE_LANGUAGES?.trim(); + if (!configuredLanguages) { + return null; + } + + return normalizeLanguageCodes(configuredLanguages); +} + +export function getAvailableTesseractLanguageEntries( + env: TesseractAvailabilityEnv = getDefaultEnv() +): Array<[TesseractLanguageCode, string]> { + const configuredLanguages = resolveConfiguredTesseractAvailableLanguages(env); + const allEntries = Object.entries(tesseractLanguages) as Array< + [TesseractLanguageCode, string] + >; + + if (!configuredLanguages) { + return allEntries; + } + + const configuredSet = new Set(configuredLanguages); + return allEntries.filter(([code]) => configuredSet.has(code)); +} + +export function getUnavailableTesseractLanguages( + requestedLanguages: string | string[], + env: TesseractAvailabilityEnv = getDefaultEnv() +): string[] { + const configuredLanguages = resolveConfiguredTesseractAvailableLanguages(env); + if (!configuredLanguages) { + return []; + } + + const configuredSet = 
new Set(configuredLanguages); + return normalizeLanguageCodes(requestedLanguages).filter( + (code) => !configuredSet.has(code) + ); +} + +export function formatTesseractLanguageList(codes: string[]): string { + return codes.map(formatLanguageLabel).join(', '); +} + +function buildUnsupportedLanguageMessage( + unavailableLanguages: string[], + availableLanguages: string[] +): string { + const unavailableText = formatTesseractLanguageList(unavailableLanguages); + const availableText = formatTesseractLanguageList(availableLanguages); + + return [ + `This BentoPDF build only bundles OCR data for ${availableText}.`, + `The requested OCR language is not available: ${unavailableText}.`, + 'Choose one of the bundled languages or rebuild the air-gapped bundle with the missing language added to --ocr-languages.', + ].join(' '); +} + +export class UnsupportedOcrLanguageError extends Error { + readonly unavailableLanguages: string[]; + readonly availableLanguages: string[]; + + constructor(unavailableLanguages: string[], availableLanguages: string[]) { + super( + buildUnsupportedLanguageMessage(unavailableLanguages, availableLanguages) + ); + this.name = 'UnsupportedOcrLanguageError'; + this.unavailableLanguages = unavailableLanguages; + this.availableLanguages = availableLanguages; + } +} + +export function assertTesseractLanguagesAvailable( + requestedLanguages: string | string[], + env: TesseractAvailabilityEnv = getDefaultEnv() +): void { + const availableLanguages = resolveConfiguredTesseractAvailableLanguages(env); + if (!availableLanguages) { + return; + } + + const unavailableLanguages = getUnavailableTesseractLanguages( + requestedLanguages, + env + ); + + if (unavailableLanguages.length > 0) { + throw new UnsupportedOcrLanguageError( + unavailableLanguages, + availableLanguages + ); + } +} diff --git a/src/js/utils/tesseract-runtime.ts b/src/js/utils/tesseract-runtime.ts new file mode 100644 index 0000000..3af7ff3 --- /dev/null +++ b/src/js/utils/tesseract-runtime.ts 
@@ -0,0 +1,172 @@
+import Tesseract from 'tesseract.js';
+import {
+  assertTesseractLanguagesAvailable,
+  TESSERACT_AVAILABLE_LANGUAGES_ENV_KEY,
+} from './tesseract-language-availability.js';
+
+/** Env keys for the three asset URLs; overriding one requires all three. */
+const TESSERACT_ENV_KEYS = [
+  'VITE_TESSERACT_WORKER_URL',
+  'VITE_TESSERACT_CORE_URL',
+  'VITE_TESSERACT_LANG_URL',
+] as const;
+
+/** Asset URL keys plus the key that lists the bundled OCR languages. */
+const TESSERACT_RUNTIME_ENV_KEYS = [
+  ...TESSERACT_ENV_KEYS,
+  TESSERACT_AVAILABLE_LANGUAGES_ENV_KEY,
+] as const;
+
+type TesseractRuntimeEnvKey = (typeof TESSERACT_RUNTIME_ENV_KEYS)[number];
+
+/** The slice of the Vite env read by the OCR runtime; every key is optional. */
+export type TesseractAssetEnv = Partial<
+  Pick<ImportMetaEnv, TesseractRuntimeEnvKey>
+>;
+
+/** Normalized asset URLs; `undefined` marks an unset or blank variable. */
+export interface TesseractAssetConfig {
+  workerPath?: string;
+  corePath?: string;
+  langPath?: string;
+}
+
+export type TesseractLoggerMessage = Tesseract.LoggerMessage;
+export type TesseractWorkerOptions = Partial<Tesseract.WorkerOptions>;
+export type TesseractWorker = Tesseract.Worker;
+
+/** Default env source: the Vite build-time environment. */
+function getDefaultTesseractAssetEnv(): TesseractAssetEnv {
+  return import.meta.env;
+}
+
+/** Trims `url` and strips trailing slashes; blank input yields `undefined`. */
+function normalizeDirectoryUrl(url?: string): string | undefined {
+  const trimmed = url?.trim();
+  if (!trimmed) return undefined;
+  return trimmed.replace(/\/+$/, '');
+}
+
+/** File URLs get the same trim: `worker.min.js/` becomes `worker.min.js`. */
+function normalizeFileUrl(url?: string): string | undefined {
+  const trimmed = url?.trim();
+  if (!trimmed) return undefined;
+  return trimmed.replace(/\/+$/, '');
+}
+
+/**
+ * Reads the worker, core and language-data URLs from `env`, normalized.
+ *
+ * @param env - Environment to read; defaults to `import.meta.env`.
+ * @returns The three paths, each `undefined` when not configured.
+ */
+export function resolveTesseractAssetConfig(
+  env: TesseractAssetEnv = getDefaultTesseractAssetEnv()
+): TesseractAssetConfig {
+  return {
+    workerPath: normalizeFileUrl(env.VITE_TESSERACT_WORKER_URL),
+    corePath: normalizeDirectoryUrl(env.VITE_TESSERACT_CORE_URL),
+    langPath: normalizeDirectoryUrl(env.VITE_TESSERACT_LANG_URL),
+  };
+}
+
+/** Returns true when at least one asset URL is configured. */
+export function hasConfiguredTesseractOverrides(
+  config: TesseractAssetConfig = resolveTesseractAssetConfig()
+): boolean {
+  return Boolean(config.workerPath || config.corePath || config.langPath);
+}
+
+/** Returns true only when all three asset URLs are configured. */
+export function hasCompleteTesseractOverrides(
+  config: TesseractAssetConfig = resolveTesseractAssetConfig()
+): boolean {
+  return Boolean(config.workerPath && config.corePath && config.langPath);
+}
+
+/**
+ * Lists the env keys still missing from a partial override.
+ *
+ * @returns An empty array when nothing is configured or when all three URLs
+ *   are present; otherwise the keys of the unset URLs.
+ */
+export function getIncompleteTesseractOverrideKeys(
+  config: TesseractAssetConfig = resolveTesseractAssetConfig()
+): Array<(typeof TESSERACT_ENV_KEYS)[number]> {
+  if (!hasConfiguredTesseractOverrides(config)) {
+    return [];
+  }
+
+  return TESSERACT_ENV_KEYS.filter((key) => {
+    switch (key) {
+      case 'VITE_TESSERACT_WORKER_URL':
+        return !config.workerPath;
+      case 'VITE_TESSERACT_CORE_URL':
+        return !config.corePath;
+      case 'VITE_TESSERACT_LANG_URL':
+        return !config.langPath;
+    }
+  });
+}
+
+/**
+ * Builds the options object handed to `Tesseract.createWorker`.
+ *
+ * @param logger - Optional progress callback, forwarded unchanged.
+ * @param env - Environment to read; defaults to `import.meta.env`.
+ * @returns Logger-only options when no asset URL is set; otherwise the three
+ *   asset paths together with `gzip: true`.
+ * @throws Error when only some of the three asset URLs are configured.
+ */
+export function buildTesseractWorkerOptions(
+  logger?: TesseractWorkerOptions['logger'],
+  env: TesseractAssetEnv = getDefaultTesseractAssetEnv()
+): TesseractWorkerOptions {
+  const config = resolveTesseractAssetConfig(env);
+
+  if (!hasConfiguredTesseractOverrides(config)) {
+    return logger ? { logger } : {};
+  }
+
+  if (!hasCompleteTesseractOverrides(config)) {
+    const missing = getIncompleteTesseractOverrideKeys(config).join(', ');
+    throw new Error(
+      `Self-hosted OCR assets are partially configured. Set ${missing} together with the other Tesseract asset URLs.`
+    );
+  }
+
+  return {
+    ...(logger ? { logger } : {}),
+    workerPath: config.workerPath,
+    corePath: config.corePath,
+    langPath: config.langPath,
+    gzip: true,
+  };
+}
+
+/**
+ * Creates a Tesseract worker after checking the requested language.
+ *
+ * @param language - Tesseract language code(s), e.g. `eng` or `eng+fra`.
+ * @param oem - OCR engine mode forwarded to `Tesseract.createWorker`.
+ * @param logger - Optional progress callback.
+ * @param env - Environment to read; defaults to `import.meta.env`.
+ * @returns The worker produced by `Tesseract.createWorker`.
+ * @throws UnsupportedOcrLanguageError (as a rejection) when the build limits
+ *   its bundled languages and `language` requests a missing one.
+ * @throws Error (as a rejection) when asset URLs are partially configured.
+ */
+export async function createConfiguredTesseractWorker(
+  language: string,
+  oem: Tesseract.OEM,
+  logger?: TesseractWorkerOptions['logger'],
+  env: TesseractAssetEnv = getDefaultTesseractAssetEnv()
+): Promise<TesseractWorker> {
+  assertTesseractLanguagesAvailable(language, env);
+
+  return Tesseract.createWorker(
+    language,
+    oem,
+    buildTesseractWorkerOptions(logger, env)
+  );
+}
diff --git a/src/pages/ocr-pdf.html b/src/pages/ocr-pdf.html
index d7d2368..0baa3a1 100644
--- a/src/pages/ocr-pdf.html
+++ b/src/pages/ocr-pdf.html
@@ -214,6 +214,10 @@ >None

+
diff --git a/src/tests/compare/ocr-page.test.ts b/src/tests/compare/ocr-page.test.ts
new file mode 100644
index 0000000..c98cfe2
--- /dev/null
+++ b/src/tests/compare/ocr-page.test.ts
@@ -0,0 +1,84 @@
+import { beforeEach, describe, expect, it, vi } from 'vitest';
+
+// Created via vi.hoisted because vi.mock factories run before module scope.
+const { createConfiguredTesseractWorker } = vi.hoisted(() => ({
+  createConfiguredTesseractWorker: vi.fn(),
+}));
+
+const mockWorker = {
+  recognize: vi.fn(),
+  terminate: vi.fn(),
+};
+
+vi.mock('../../js/utils/tesseract-runtime', () => ({
+  createConfiguredTesseractWorker,
+}));
+
+// Module under test; its tesseract-runtime import resolves to the mock above.
+import { recognizePageCanvas } from '../../js/compare/engine/ocr-page';
+
+describe('compare OCR page recognition', () => {
+  beforeEach(() => {
+    createConfiguredTesseractWorker.mockReset();
+    mockWorker.recognize.mockReset();
+    mockWorker.terminate.mockReset();
+    createConfiguredTesseractWorker.mockResolvedValue(mockWorker);
+  });
+
+  it('uses the configured Tesseract worker and maps OCR words into compare text items', async () => {
+    const progress = vi.fn();
+    const canvas = {
+      width: 300,
+      height: 150,
+    } as HTMLCanvasElement;
+
+    mockWorker.recognize.mockResolvedValue({
+      data: {
+        words: [
+          {
+            text: 'Hello',
+            bbox: { x0: 10, y0: 20, x1: 60, y1: 40 },
+          },
+          {
+            text: 'world',
+            bbox: { x0: 70, y0: 20, x1: 120, y1: 40 },
+          },
+        ],
+      },
+    });
+
+    const model = await recognizePageCanvas(canvas, 'eng', progress);
+
+    expect(createConfiguredTesseractWorker).toHaveBeenCalledWith(
+      'eng',
+      1,
+      expect.any(Function)
+    );
+    expect(mockWorker.recognize).toHaveBeenCalledWith(canvas);
+    expect(mockWorker.terminate).toHaveBeenCalledTimes(1);
+    expect(model.source).toBe('ocr');
+    expect(model.hasText).toBe(true);
+    expect(model.plainText).toContain('Hello');
+    expect(model.textItems).toHaveLength(1);
+
+    // Third argument to the worker factory is the logger; drive it directly.
+    const logger = createConfiguredTesseractWorker.mock
+      .calls[0][2] as (message: { status: string; progress: number }) => void;
+    logger({ status: 'recognizing text', progress: 0.5 });
+    expect(progress).toHaveBeenCalledWith('recognizing text', 0.5);
+  });
+
+  it('terminates the worker when compare OCR fails', async () => {
+    const canvas = {
+      width: 300,
+      height: 150,
+    } as HTMLCanvasElement;
+    mockWorker.recognize.mockRejectedValueOnce(new Error('compare ocr failed'));
+
+    await expect(recognizePageCanvas(canvas, 'eng')).rejects.toThrow(
+      'compare ocr failed'
+    );
+
+    expect(mockWorker.terminate).toHaveBeenCalledTimes(1);
+  });
+});
diff --git a/src/tests/font-loader.test.ts b/src/tests/font-loader.test.ts
new file mode 100644
index 0000000..dfadcf0
--- /dev/null
+++ b/src/tests/font-loader.test.ts
@@ -0,0 +1,29 @@
+import { describe, expect, it } from 'vitest';
+
+import { getFontAssetFileName } from '../js/config/font-mappings';
+import { resolveFontUrl } from '../js/utils/font-loader';
+
+// Exercises font URL resolution with and without VITE_OCR_FONT_BASE_URL.
+describe('font-loader', () => {
+  it('uses the default public font URL when no offline font base URL is configured', () => {
+    expect(resolveFontUrl('Noto Sans', {})).toBe(
+      'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSans/NotoSans-Regular.ttf'
+    );
+  });
+
+  it('builds a self-hosted font URL when an OCR font base URL is configured', () => {
+    expect(
+      resolveFontUrl('Noto Sans Arabic', {
+        VITE_OCR_FONT_BASE_URL: 'https://internal.example.com/wasm/ocr/fonts/',
+      })
+    ).toBe(
+      'https://internal.example.com/wasm/ocr/fonts/NotoSansArabic-Regular.ttf'
+    );
+  });
+
+  it('derives the bundled font asset file name from the default font URL', () => {
+    expect(getFontAssetFileName('Noto Sans SC')).toBe(
+      'NotoSansCJKsc-Regular.otf'
+    );
+  });
+});
diff --git a/src/tests/ocr.test.ts b/src/tests/ocr.test.ts
new file mode 100644
index 0000000..97e175b
--- /dev/null
+++ b/src/tests/ocr.test.ts
@@ -0,0 +1,190 @@
+import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
+
+// Mock fns come from vi.hoisted so the hoisted vi.mock factories can see them.
+const {
+  createConfiguredTesseractWorker,
+  getPDFDocument,
+  getFontForLanguage,
+  parseHocrDocument,
+} = vi.hoisted(() => ({
+  createConfiguredTesseractWorker: vi.fn(),
+  getPDFDocument: vi.fn(),
+  getFontForLanguage: vi.fn(),
+  parseHocrDocument: vi.fn(),
+}));
+
+// Stand-ins for the OCR worker and the PDF objects that performOcr touches.
+const mockWorker = {
+  setParameters: vi.fn(),
+  recognize: vi.fn(),
+  terminate: vi.fn(),
+};
+
+const mockPdfPage = {
+  getViewport: vi.fn(() => ({ width: 200, height: 100 })),
+  render: vi.fn(() => ({ promise: Promise.resolve() })),
+};
+
+const mockPdfOutputPage = {
+  drawImage: vi.fn(),
+  drawText: vi.fn(),
+};
+
+const mockPdfDoc = {
+  registerFontkit: vi.fn(),
+  embedFont: vi.fn(async () => ({ widthOfTextAtSize: vi.fn(() => 12) })),
+  addPage: vi.fn(() => mockPdfOutputPage),
+  embedPng: vi.fn(async () => ({ id: 'png' })),
+  save: vi.fn(async () => new Uint8Array([1, 2, 3])),
+};
+
+vi.mock('../js/utils/tesseract-runtime', () => ({
+  createConfiguredTesseractWorker,
+}));
+
+vi.mock('../js/utils/helpers.js', () => ({
+  getPDFDocument,
+}));
+
+vi.mock('../js/utils/font-loader.js', () => ({
+  getFontForLanguage,
+}));
+
+vi.mock('../js/utils/hocr-transform.js', () => ({
+  parseHocrDocument,
+  calculateWordTransform: vi.fn(),
+  calculateSpaceTransform: vi.fn(),
+}));
+
+vi.mock('pdf-lib', () => ({
+  PDFDocument: {
+    create: vi.fn(async () => mockPdfDoc),
+  },
+  StandardFonts: {
+    Helvetica: 'Helvetica',
+  },
+  rgb: vi.fn(() => ({ r: 0, g: 0, b: 0 })),
+}));
+
+vi.mock('@pdf-lib/fontkit', () => ({
+  default: {},
+}));
+
+import { performOcr } from '../js/utils/ocr';
+
+describe('performOcr', () => {
+  const originalCreateElement = document.createElement.bind(document);
+  const originalFileReader = globalThis.FileReader;
+
+  beforeEach(() => {
+    createConfiguredTesseractWorker.mockReset();
+    getPDFDocument.mockReset();
+    getFontForLanguage.mockReset();
+    parseHocrDocument.mockReset();
+
+    mockWorker.setParameters.mockReset();
+    mockWorker.recognize.mockReset();
+    mockWorker.terminate.mockReset();
+    mockPdfPage.getViewport.mockClear();
+    mockPdfPage.render.mockClear();
+    mockPdfOutputPage.drawImage.mockClear();
+    mockPdfOutputPage.drawText.mockClear();
+    mockPdfDoc.registerFontkit.mockClear();
+    mockPdfDoc.embedFont.mockClear();
+    mockPdfDoc.addPage.mockClear();
+    mockPdfDoc.embedPng.mockClear();
+    mockPdfDoc.save.mockClear();
+
+    createConfiguredTesseractWorker.mockResolvedValue(mockWorker);
+    getPDFDocument.mockReturnValue({
+      promise: Promise.resolve({
+        numPages: 1,
+        getPage: vi.fn(async () => mockPdfPage),
+      }),
+    });
+    getFontForLanguage.mockResolvedValue(new Uint8Array([1, 2, 3]));
+    mockWorker.recognize.mockResolvedValue({
+      data: {
+        text: 'Recognized text',
+        hocr: '',
+      },
+    });
+
+    // Stub canvas creation; every other tag still goes to the real DOM.
+    document.createElement = ((tagName: string) => {
+      if (tagName !== 'canvas') {
+        return originalCreateElement(tagName);
+      }
+
+      return {
+        width: 0,
+        height: 0,
+        getContext: vi.fn(() => ({
+          canvas: { width: 200, height: 100 },
+          getImageData: vi.fn(() => ({ data: new Uint8ClampedArray(4) })),
+          putImageData: vi.fn(),
+        })),
+        toBlob: vi.fn((callback: (blob: Blob) => void) => {
+          callback(
+            new Blob([new Uint8Array([1, 2, 3])], { type: 'image/png' })
+          );
+        }),
+      } as unknown as HTMLCanvasElement;
+    }) as typeof document.createElement;
+
+    // Stub FileReader that fires onload synchronously with fixed bytes.
+    globalThis.FileReader = class {
+      result: ArrayBuffer = new Uint8Array([1, 2, 3]).buffer;
+      onload: null | (() => void) = null;
+      onerror: null | (() => void) = null;
+
+      readAsArrayBuffer() {
+        this.onload?.();
+      }
+    } as unknown as typeof FileReader;
+  });
+
+  // Undo the global patches from beforeEach so other suites are unaffected.
+  afterEach(() => {
+    document.createElement = originalCreateElement;
+    globalThis.FileReader = originalFileReader;
+  });
+
+  it('uses the configured Tesseract worker and terminates it after OCR completes', async () => {
+    const result = await performOcr(new Uint8Array([1, 2, 3]), {
+      language: 'eng',
+      resolution: 2,
+      binarize: false,
+      whitelist: '',
+    });
+
+    expect(createConfiguredTesseractWorker).toHaveBeenCalledWith(
+      'eng',
+      1,
+      expect.any(Function)
+    );
+    expect(mockWorker.setParameters).toHaveBeenCalledWith({
+      tessjs_create_hocr: '1',
+      tessedit_pageseg_mode: '3',
+    });
+    expect(mockWorker.recognize).toHaveBeenCalledTimes(1);
+    expect(mockWorker.terminate).toHaveBeenCalledTimes(1);
+    expect(result.fullText).toContain('Recognized text');
+    expect(result.pdfBytes).toBeInstanceOf(Uint8Array);
+  });
+
+  it('terminates the Tesseract worker when OCR fails', async () => {
+    mockWorker.recognize.mockRejectedValueOnce(new Error('ocr failed'));
+
+    await expect(
+      performOcr(new Uint8Array([1, 2, 3]), {
+        language: 'eng',
+        resolution: 2,
+        binarize: false,
+        whitelist: '',
+      })
+    ).rejects.toThrow('ocr failed');
+
+    expect(mockWorker.terminate).toHaveBeenCalledTimes(1);
+  });
+});
diff --git a/src/tests/tesseract-runtime.test.ts b/src/tests/tesseract-runtime.test.ts
new file mode 100644
index 0000000..748aaa7
--- /dev/null
+++ b/src/tests/tesseract-runtime.test.ts
@@ -0,0 +1,129 @@
+import { beforeEach, describe, expect, it, vi } from 'vitest';
+
+// Created via vi.hoisted because vi.mock factories run before module scope.
+const { createWorker } = vi.hoisted(() => ({
+  createWorker: vi.fn(),
+}));
+
+vi.mock('tesseract.js', () => ({
+  default: {
+    createWorker,
+  },
+}));
+
+import {
+  buildTesseractWorkerOptions,
+  createConfiguredTesseractWorker,
+  getIncompleteTesseractOverrideKeys,
+  hasCompleteTesseractOverrides,
+  hasConfiguredTesseractOverrides,
+  resolveTesseractAssetConfig,
+} from '../js/utils/tesseract-runtime';
+import {
+  assertTesseractLanguagesAvailable,
+  getAvailableTesseractLanguageEntries,
+  getUnavailableTesseractLanguages,
+  UnsupportedOcrLanguageError,
+} from '../js/utils/tesseract-language-availability';
+
+describe('tesseract-runtime', () => {
+  beforeEach(() => {
+    createWorker.mockReset();
+  });
+
+  it('normalizes self-hosted OCR asset URLs', () => {
+    const config = resolveTesseractAssetConfig({
+      VITE_TESSERACT_WORKER_URL:
+        'https://internal.example.com/ocr/worker.min.js/',
+      VITE_TESSERACT_CORE_URL: 'https://internal.example.com/ocr/core/',
+      VITE_TESSERACT_LANG_URL: 'https://internal.example.com/ocr/lang-data/',
+    });
+
+    expect(config).toEqual({
+      workerPath: 'https://internal.example.com/ocr/worker.min.js',
+      corePath: 'https://internal.example.com/ocr/core',
+      langPath: 'https://internal.example.com/ocr/lang-data',
+    });
+    expect(hasConfiguredTesseractOverrides(config)).toBe(true);
+    expect(hasCompleteTesseractOverrides(config)).toBe(true);
+  });
+
+  it('returns logger-only options when no self-hosted OCR assets are configured', () => {
+    const logger = vi.fn();
+
+    expect(buildTesseractWorkerOptions(logger, {})).toEqual({ logger });
+    expect(
+      hasConfiguredTesseractOverrides(resolveTesseractAssetConfig({}))
+    ).toBe(false);
+  });
+
+  it('throws on partial OCR asset configuration', () => {
+    const env = {
+      VITE_TESSERACT_WORKER_URL:
+        'https://internal.example.com/ocr/worker.min.js',
+      VITE_TESSERACT_CORE_URL: 'https://internal.example.com/ocr/core',
+    };
+
+    expect(
+      getIncompleteTesseractOverrideKeys(resolveTesseractAssetConfig(env))
+    ).toEqual(['VITE_TESSERACT_LANG_URL']);
+    expect(() => buildTesseractWorkerOptions(undefined, env)).toThrow(
+      'Self-hosted OCR assets are partially configured'
+    );
+  });
+
+  it('passes configured OCR asset URLs to Tesseract.createWorker', async () => {
+    const logger = vi.fn();
+    createWorker.mockResolvedValue({ id: 'worker' });
+
+    await createConfiguredTesseractWorker('eng', 1, logger, {
+      VITE_TESSERACT_WORKER_URL:
+        'https://internal.example.com/ocr/worker.min.js',
+      VITE_TESSERACT_CORE_URL: 'https://internal.example.com/ocr/core',
+      VITE_TESSERACT_LANG_URL: 'https://internal.example.com/ocr/lang-data',
+    });
+
+    expect(createWorker).toHaveBeenCalledWith('eng', 1, {
+      logger,
+      workerPath: 'https://internal.example.com/ocr/worker.min.js',
+      corePath: 'https://internal.example.com/ocr/core',
+      langPath: 'https://internal.example.com/ocr/lang-data',
+      gzip: true,
+    });
+  });
+
+  it('filters OCR language entries when the build restricts bundled languages', () => {
+    expect(
+      getAvailableTesseractLanguageEntries({
+        VITE_TESSERACT_AVAILABLE_LANGUAGES: 'eng,deu',
+      })
+    ).toEqual([
+      ['eng', 'English'],
+      ['deu', 'German'],
+    ]);
+  });
+
+  it('reports unavailable OCR languages for restricted air-gap builds', () => {
+    expect(
+      getUnavailableTesseractLanguages('eng+fra', {
+        VITE_TESSERACT_AVAILABLE_LANGUAGES: 'eng,deu',
+      })
+    ).toEqual(['fra']);
+
+    expect(() =>
+      assertTesseractLanguagesAvailable('eng+fra', {
+        VITE_TESSERACT_AVAILABLE_LANGUAGES: 'eng,deu',
+      })
+    ).toThrow(UnsupportedOcrLanguageError);
+  });
+
+  it('blocks worker creation when OCR requests an unbundled language', async () => {
+    await expect(
+      createConfiguredTesseractWorker('fra', 1, undefined, {
+        VITE_TESSERACT_AVAILABLE_LANGUAGES: 'eng,deu',
+      })
+    ).rejects.toThrow('This BentoPDF build only bundles OCR data for');
+
+    expect(createWorker).not.toHaveBeenCalled();
+  });
+});
diff --git a/src/types/globals.d.ts b/src/types/globals.d.ts
index 48c971c..aee6f2f 100644
--- a/src/types/globals.d.ts
+++ b/src/types/globals.d.ts
@@ -1 +1,16 @@
+/// <reference types="vite/client" />
+
+/** Build-time OCR settings that Vite exposes on `import.meta.env`. */
+interface ImportMetaEnv {
+  readonly VITE_TESSERACT_WORKER_URL?: string;
+  readonly VITE_TESSERACT_CORE_URL?: string;
+  readonly VITE_TESSERACT_LANG_URL?: string;
+  readonly VITE_TESSERACT_AVAILABLE_LANGUAGES?: string;
+  readonly VITE_OCR_FONT_BASE_URL?: string;
+}
+
+interface ImportMeta {
+  readonly env: ImportMetaEnv;
+}
+
 declare const __SIMPLE_MODE__: boolean;