feat: integrate Tesseract.js with improved language availability and font handling
- Refactored OCR page recognition to utilize a configured Tesseract worker.
- Added functions to manage font URLs and asset filenames based on language.
- Implemented language availability checks and error handling for unsupported languages.
- Enhanced PDF workflow to display available OCR languages and handle user selections.
- Introduced utility functions for resolving Tesseract asset configurations.
- Added tests for OCR functionality, font loading, and Tesseract runtime behavior.
- Updated global types to include environment variables for Tesseract and font configurations.
This commit is contained in:
@@ -12,6 +12,15 @@ VITE_WASM_PYMUPDF_URL=https://cdn.jsdelivr.net/npm/@bentopdf/pymupdf-wasm@0.11.1
|
|||||||
VITE_WASM_GS_URL=https://cdn.jsdelivr.net/npm/@bentopdf/gs-wasm/assets/
|
VITE_WASM_GS_URL=https://cdn.jsdelivr.net/npm/@bentopdf/gs-wasm/assets/
|
||||||
VITE_WASM_CPDF_URL=https://cdn.jsdelivr.net/npm/coherentpdf/dist/
|
VITE_WASM_CPDF_URL=https://cdn.jsdelivr.net/npm/coherentpdf/dist/
|
||||||
|
|
||||||
|
# OCR assets (optional)
|
||||||
|
# Set the worker, core, and lang URLs together for self-hosted or air-gapped OCR.
|
||||||
|
# Leave empty to use Tesseract.js runtime defaults.
|
||||||
|
VITE_TESSERACT_WORKER_URL=
|
||||||
|
VITE_TESSERACT_CORE_URL=
|
||||||
|
VITE_TESSERACT_LANG_URL=
|
||||||
|
VITE_TESSERACT_AVAILABLE_LANGUAGES=
|
||||||
|
VITE_OCR_FONT_BASE_URL=
|
||||||
|
|
||||||
# Default UI language (build-time)
|
# Default UI language (build-time)
|
||||||
# Supported: en, ar, be, fr, de, es, zh, zh-TW, vi, tr, id, it, pt, nl, da
|
# Supported: en, ar, be, fr, de, es, zh, zh-TW, vi, tr, id, it, pt, nl, da
|
||||||
VITE_DEFAULT_LANGUAGE=
|
VITE_DEFAULT_LANGUAGE=
|
||||||
|
|||||||
12
Dockerfile
12
Dockerfile
@@ -35,6 +35,18 @@ ENV VITE_WASM_PYMUPDF_URL=$VITE_WASM_PYMUPDF_URL
|
|||||||
ENV VITE_WASM_GS_URL=$VITE_WASM_GS_URL
|
ENV VITE_WASM_GS_URL=$VITE_WASM_GS_URL
|
||||||
ENV VITE_WASM_CPDF_URL=$VITE_WASM_CPDF_URL
|
ENV VITE_WASM_CPDF_URL=$VITE_WASM_CPDF_URL
|
||||||
|
|
||||||
|
# OCR asset URLs (optional, used for self-hosted or air-gapped OCR)
|
||||||
|
ARG VITE_TESSERACT_WORKER_URL
|
||||||
|
ARG VITE_TESSERACT_CORE_URL
|
||||||
|
ARG VITE_TESSERACT_LANG_URL
|
||||||
|
ARG VITE_TESSERACT_AVAILABLE_LANGUAGES
|
||||||
|
ARG VITE_OCR_FONT_BASE_URL
|
||||||
|
ENV VITE_TESSERACT_WORKER_URL=$VITE_TESSERACT_WORKER_URL
|
||||||
|
ENV VITE_TESSERACT_CORE_URL=$VITE_TESSERACT_CORE_URL
|
||||||
|
ENV VITE_TESSERACT_LANG_URL=$VITE_TESSERACT_LANG_URL
|
||||||
|
ENV VITE_TESSERACT_AVAILABLE_LANGUAGES=$VITE_TESSERACT_AVAILABLE_LANGUAGES
|
||||||
|
ENV VITE_OCR_FONT_BASE_URL=$VITE_OCR_FONT_BASE_URL
|
||||||
|
|
||||||
# Default UI language (e.g. en, fr, de, es, zh, ar)
|
# Default UI language (e.g. en, fr, de, es, zh, ar)
|
||||||
ARG VITE_DEFAULT_LANGUAGE
|
ARG VITE_DEFAULT_LANGUAGE
|
||||||
ENV VITE_DEFAULT_LANGUAGE=$VITE_DEFAULT_LANGUAGE
|
ENV VITE_DEFAULT_LANGUAGE=$VITE_DEFAULT_LANGUAGE
|
||||||
|
|||||||
@@ -32,6 +32,17 @@ ENV VITE_WASM_PYMUPDF_URL=$VITE_WASM_PYMUPDF_URL
|
|||||||
ENV VITE_WASM_GS_URL=$VITE_WASM_GS_URL
|
ENV VITE_WASM_GS_URL=$VITE_WASM_GS_URL
|
||||||
ENV VITE_WASM_CPDF_URL=$VITE_WASM_CPDF_URL
|
ENV VITE_WASM_CPDF_URL=$VITE_WASM_CPDF_URL
|
||||||
|
|
||||||
|
ARG VITE_TESSERACT_WORKER_URL
|
||||||
|
ARG VITE_TESSERACT_CORE_URL
|
||||||
|
ARG VITE_TESSERACT_LANG_URL
|
||||||
|
ARG VITE_TESSERACT_AVAILABLE_LANGUAGES
|
||||||
|
ARG VITE_OCR_FONT_BASE_URL
|
||||||
|
ENV VITE_TESSERACT_WORKER_URL=$VITE_TESSERACT_WORKER_URL
|
||||||
|
ENV VITE_TESSERACT_CORE_URL=$VITE_TESSERACT_CORE_URL
|
||||||
|
ENV VITE_TESSERACT_LANG_URL=$VITE_TESSERACT_LANG_URL
|
||||||
|
ENV VITE_TESSERACT_AVAILABLE_LANGUAGES=$VITE_TESSERACT_AVAILABLE_LANGUAGES
|
||||||
|
ENV VITE_OCR_FONT_BASE_URL=$VITE_OCR_FONT_BASE_URL
|
||||||
|
|
||||||
# Default UI language (e.g. en, fr, de, es, zh, ar)
|
# Default UI language (e.g. en, fr, de, es, zh, ar)
|
||||||
ARG VITE_DEFAULT_LANGUAGE
|
ARG VITE_DEFAULT_LANGUAGE
|
||||||
ENV VITE_DEFAULT_LANGUAGE=$VITE_DEFAULT_LANGUAGE
|
ENV VITE_DEFAULT_LANGUAGE=$VITE_DEFAULT_LANGUAGE
|
||||||
|
|||||||
86
README.md
86
README.md
@@ -465,6 +465,11 @@ The default URLs are set in `.env.production`:
|
|||||||
VITE_WASM_PYMUPDF_URL=https://cdn.jsdelivr.net/npm/@bentopdf/pymupdf-wasm@0.11.16/
|
VITE_WASM_PYMUPDF_URL=https://cdn.jsdelivr.net/npm/@bentopdf/pymupdf-wasm@0.11.16/
|
||||||
VITE_WASM_GS_URL=https://cdn.jsdelivr.net/npm/@bentopdf/gs-wasm/assets/
|
VITE_WASM_GS_URL=https://cdn.jsdelivr.net/npm/@bentopdf/gs-wasm/assets/
|
||||||
VITE_WASM_CPDF_URL=https://cdn.jsdelivr.net/npm/coherentpdf/dist/
|
VITE_WASM_CPDF_URL=https://cdn.jsdelivr.net/npm/coherentpdf/dist/
|
||||||
|
VITE_TESSERACT_WORKER_URL=
|
||||||
|
VITE_TESSERACT_CORE_URL=
|
||||||
|
VITE_TESSERACT_LANG_URL=
|
||||||
|
VITE_TESSERACT_AVAILABLE_LANGUAGES=
|
||||||
|
VITE_OCR_FONT_BASE_URL=
|
||||||
```
|
```
|
||||||
|
|
||||||
To override via Docker build args:
|
To override via Docker build args:
|
||||||
@@ -474,11 +479,18 @@ docker build \
|
|||||||
--build-arg VITE_WASM_PYMUPDF_URL=https://your-server.com/pymupdf/ \
|
--build-arg VITE_WASM_PYMUPDF_URL=https://your-server.com/pymupdf/ \
|
||||||
--build-arg VITE_WASM_GS_URL=https://your-server.com/gs/ \
|
--build-arg VITE_WASM_GS_URL=https://your-server.com/gs/ \
|
||||||
--build-arg VITE_WASM_CPDF_URL=https://your-server.com/cpdf/ \
|
--build-arg VITE_WASM_CPDF_URL=https://your-server.com/cpdf/ \
|
||||||
|
--build-arg VITE_TESSERACT_WORKER_URL=https://your-server.com/ocr/worker.min.js \
|
||||||
|
--build-arg VITE_TESSERACT_CORE_URL=https://your-server.com/ocr/core \
|
||||||
|
--build-arg VITE_TESSERACT_LANG_URL=https://your-server.com/ocr/lang-data \
|
||||||
|
--build-arg VITE_TESSERACT_AVAILABLE_LANGUAGES=eng,deu \
|
||||||
|
--build-arg VITE_OCR_FONT_BASE_URL=https://your-server.com/ocr/fonts \
|
||||||
-t bentopdf .
|
-t bentopdf .
|
||||||
```
|
```
|
||||||
|
|
||||||
To disable a module (require manual user config via Advanced Settings), set its variable to an empty string.
|
To disable a module (require manual user config via Advanced Settings), set its variable to an empty string.
|
||||||
|
|
||||||
|
For OCR, either leave all `VITE_TESSERACT_*` variables empty and use the default online assets, or set the worker/core/lang URLs together for self-hosted/offline OCR. If your self-hosted bundle only includes a subset such as `eng,deu`, also set `VITE_TESSERACT_AVAILABLE_LANGUAGES=eng,deu` so the UI only shows bundled languages and OCR fails with a descriptive message for unsupported ones. For fully offline searchable-PDF output, also set `VITE_OCR_FONT_BASE_URL` to the internal directory that serves the bundled OCR text-layer fonts.
|
||||||
|
|
||||||
Users can also override these defaults per-browser via **Advanced Settings** in the UI — user overrides take priority over the environment defaults.
|
Users can also override these defaults per-browser via **Advanced Settings** in the UI — user overrides take priority over the environment defaults.
|
||||||
|
|
||||||
> [!IMPORTANT]
|
> [!IMPORTANT]
|
||||||
@@ -496,6 +508,12 @@ The included `prepare-airgap.sh` script automates the entire process — downloa
|
|||||||
git clone https://github.com/alam00000/bentopdf.git
|
git clone https://github.com/alam00000/bentopdf.git
|
||||||
cd bentopdf
|
cd bentopdf
|
||||||
|
|
||||||
|
# Show supported OCR language codes (for --ocr-languages)
|
||||||
|
bash scripts/prepare-airgap.sh --list-ocr-languages
|
||||||
|
|
||||||
|
# Search OCR language codes by name or abbreviation
|
||||||
|
bash scripts/prepare-airgap.sh --search-ocr-language german
|
||||||
|
|
||||||
# Interactive mode — prompts for all options
|
# Interactive mode — prompts for all options
|
||||||
bash scripts/prepare-airgap.sh
|
bash scripts/prepare-airgap.sh
|
||||||
|
|
||||||
@@ -508,7 +526,9 @@ This produces a bundle directory containing:
|
|||||||
```
|
```
|
||||||
bentopdf-airgap-bundle/
|
bentopdf-airgap-bundle/
|
||||||
bentopdf.tar # Docker image
|
bentopdf.tar # Docker image
|
||||||
*.tgz # WASM packages (PyMuPDF, Ghostscript, CoherentPDF)
|
*.tgz # WASM packages (PyMuPDF, Ghostscript, CoherentPDF, Tesseract)
|
||||||
|
tesseract-langdata/ # OCR traineddata files
|
||||||
|
ocr-fonts/ # OCR text-layer font files
|
||||||
setup.sh # Setup script for the air-gapped side
|
setup.sh # Setup script for the air-gapped side
|
||||||
README.md # Instructions
|
README.md # Instructions
|
||||||
```
|
```
|
||||||
@@ -525,23 +545,28 @@ The setup script loads the Docker image, extracts WASM files, and optionally sta
|
|||||||
<details>
|
<details>
|
||||||
<summary><strong>Script options</strong></summary>
|
<summary><strong>Script options</strong></summary>
|
||||||
|
|
||||||
| Flag | Description | Default |
|
| Flag | Description | Default |
|
||||||
| ----------------------- | ------------------------------------------------ | --------------------------------- |
|
| ------------------------------ | ------------------------------------------------ | --------------------------------- |
|
||||||
| `--wasm-base-url <url>` | Where WASMs will be hosted internally | _(required, prompted if missing)_ |
|
| `--wasm-base-url <url>` | Where WASMs will be hosted internally | _(required, prompted if missing)_ |
|
||||||
| `--image-name <name>` | Docker image tag | `bentopdf` |
|
| `--image-name <name>` | Docker image tag | `bentopdf` |
|
||||||
| `--output-dir <path>` | Output bundle directory | `./bentopdf-airgap-bundle` |
|
| `--output-dir <path>` | Output bundle directory | `./bentopdf-airgap-bundle` |
|
||||||
| `--simple-mode` | Enable Simple Mode | off |
|
| `--simple-mode` | Enable Simple Mode | off |
|
||||||
| `--base-url <path>` | Subdirectory base URL (e.g. `/pdf/`) | `/` |
|
| `--base-url <path>` | Subdirectory base URL (e.g. `/pdf/`) | `/` |
|
||||||
| `--language <code>` | Default UI language (e.g. `fr`, `de`) | _(none)_ |
|
| `--language <code>` | Default UI language (e.g. `fr`, `de`) | _(none)_ |
|
||||||
| `--brand-name <name>` | Custom brand name | _(none)_ |
|
| `--brand-name <name>` | Custom brand name | _(none)_ |
|
||||||
| `--brand-logo <path>` | Logo path relative to `public/` | _(none)_ |
|
| `--brand-logo <path>` | Logo path relative to `public/` | _(none)_ |
|
||||||
| `--footer-text <text>` | Custom footer text | _(none)_ |
|
| `--footer-text <text>` | Custom footer text | _(none)_ |
|
||||||
| `--dockerfile <path>` | Dockerfile to use | `Dockerfile` |
|
| `--ocr-languages <list>` | Comma-separated OCR languages to bundle | `eng` |
|
||||||
| `--skip-docker` | Skip Docker build and export | off |
|
| `--list-ocr-languages` | Print supported OCR codes and names, then exit | off |
|
||||||
| `--skip-wasm` | Skip WASM download (reuse existing `.tgz` files) | off |
|
| `--search-ocr-language <term>` | Search OCR codes by name or abbreviation | off |
|
||||||
|
| `--dockerfile <path>` | Dockerfile to use | `Dockerfile` |
|
||||||
|
| `--skip-docker` | Skip Docker build and export | off |
|
||||||
|
| `--skip-wasm` | Skip WASM download (reuse existing `.tgz` files) | off |
|
||||||
|
|
||||||
</details>
|
</details>
|
||||||
|
|
||||||
|
The interactive prompt also accepts `list` to print the full supported Tesseract code list and `search <term>` to find matches such as `search german` or `search chi`.
|
||||||
|
|
||||||
> [!IMPORTANT]
|
> [!IMPORTANT]
|
||||||
> WASM files must be served from the **same origin** as the BentoPDF app. Web Workers use `importScripts()` which cannot load scripts cross-origin. For example, if BentoPDF runs at `https://internal.example.com`, the WASM base URL should also be `https://internal.example.com/wasm`.
|
> WASM files must be served from the **same origin** as the BentoPDF app. Web Workers use `importScripts()` which cannot load scripts cross-origin. For example, if BentoPDF runs at `https://internal.example.com`, the WASM base URL should also be `https://internal.example.com/wasm`.
|
||||||
|
|
||||||
@@ -550,12 +575,18 @@ The setup script loads the Docker image, extracts WASM files, and optionally sta
|
|||||||
<details>
|
<details>
|
||||||
<summary>If you prefer to do it manually without the script</summary>
|
<summary>If you prefer to do it manually without the script</summary>
|
||||||
|
|
||||||
**Step 1: Download the WASM packages** (on a machine with internet)
|
**Step 1: Download the WASM and OCR packages** (on a machine with internet)
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
npm pack @bentopdf/pymupdf-wasm@0.11.16
|
npm pack @bentopdf/pymupdf-wasm@0.11.16
|
||||||
npm pack @bentopdf/gs-wasm
|
npm pack @bentopdf/gs-wasm
|
||||||
npm pack coherentpdf
|
npm pack coherentpdf
|
||||||
|
npm pack tesseract.js@7.0.0
|
||||||
|
npm pack tesseract.js-core@7.0.0
|
||||||
|
mkdir -p tesseract-langdata
|
||||||
|
curl -fsSL https://cdn.jsdelivr.net/npm/@tesseract.js-data/eng/4.0.0_best_int/eng.traineddata.gz -o tesseract-langdata/eng.traineddata.gz
|
||||||
|
mkdir -p ocr-fonts
|
||||||
|
curl -fsSL https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSans/NotoSans-Regular.ttf -o ocr-fonts/NotoSans-Regular.ttf
|
||||||
```
|
```
|
||||||
|
|
||||||
**Step 2: Build the Docker image with internal URLs**
|
**Step 2: Build the Docker image with internal URLs**
|
||||||
@@ -568,6 +599,10 @@ docker build \
|
|||||||
--build-arg VITE_WASM_PYMUPDF_URL=https://internal-server.example.com/wasm/pymupdf/ \
|
--build-arg VITE_WASM_PYMUPDF_URL=https://internal-server.example.com/wasm/pymupdf/ \
|
||||||
--build-arg VITE_WASM_GS_URL=https://internal-server.example.com/wasm/gs/ \
|
--build-arg VITE_WASM_GS_URL=https://internal-server.example.com/wasm/gs/ \
|
||||||
--build-arg VITE_WASM_CPDF_URL=https://internal-server.example.com/wasm/cpdf/ \
|
--build-arg VITE_WASM_CPDF_URL=https://internal-server.example.com/wasm/cpdf/ \
|
||||||
|
--build-arg VITE_TESSERACT_WORKER_URL=https://internal-server.example.com/wasm/ocr/worker.min.js \
|
||||||
|
--build-arg VITE_TESSERACT_CORE_URL=https://internal-server.example.com/wasm/ocr/core \
|
||||||
|
--build-arg VITE_TESSERACT_LANG_URL=https://internal-server.example.com/wasm/ocr/lang-data \
|
||||||
|
--build-arg VITE_OCR_FONT_BASE_URL=https://internal-server.example.com/wasm/ocr/fonts \
|
||||||
-t bentopdf .
|
-t bentopdf .
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -585,6 +620,10 @@ Copy these files via USB drive, internal artifact repository, or approved transf
|
|||||||
- `bentopdf-pymupdf-wasm-0.11.14.tgz` — PyMuPDF WASM package
|
- `bentopdf-pymupdf-wasm-0.11.16.tgz` — PyMuPDF WASM package
|
||||||
- `bentopdf-gs-wasm-*.tgz` — Ghostscript WASM package
|
- `bentopdf-gs-wasm-*.tgz` — Ghostscript WASM package
|
||||||
- `coherentpdf-*.tgz` — CoherentPDF WASM package
|
- `coherentpdf-*.tgz` — CoherentPDF WASM package
|
||||||
|
- `tesseract.js-7.0.0.tgz` — Tesseract worker package
|
||||||
|
- `tesseract.js-core-7.0.0.tgz` — Tesseract core runtime package
|
||||||
|
- `tesseract-langdata/` — OCR traineddata files
|
||||||
|
- `ocr-fonts/` — OCR text-layer font files
|
||||||
|
|
||||||
**Step 5: Set up inside the air-gapped network**
|
**Step 5: Set up inside the air-gapped network**
|
||||||
|
|
||||||
@@ -593,16 +632,23 @@ Copy these files via USB drive, internal artifact repository, or approved transf
|
|||||||
docker load -i bentopdf.tar
|
docker load -i bentopdf.tar
|
||||||
|
|
||||||
# Extract the WASM packages
|
# Extract the WASM packages
|
||||||
mkdir -p ./wasm/pymupdf ./wasm/gs ./wasm/cpdf
|
mkdir -p ./wasm/pymupdf ./wasm/gs ./wasm/cpdf ./wasm/ocr/core ./wasm/ocr/lang-data ./wasm/ocr/fonts
|
||||||
tar xzf bentopdf-pymupdf-wasm-0.11.14.tgz -C ./wasm/pymupdf --strip-components=1
|
tar xzf bentopdf-pymupdf-wasm-0.11.16.tgz -C ./wasm/pymupdf --strip-components=1
|
||||||
tar xzf bentopdf-gs-wasm-*.tgz -C ./wasm/gs --strip-components=1
|
tar xzf bentopdf-gs-wasm-*.tgz -C ./wasm/gs --strip-components=1
|
||||||
tar xzf coherentpdf-*.tgz -C ./wasm/cpdf --strip-components=1
|
tar xzf coherentpdf-*.tgz -C ./wasm/cpdf --strip-components=1
|
||||||
|
TEMP_TESS=$(mktemp -d)
|
||||||
|
tar xzf tesseract.js-7.0.0.tgz -C "$TEMP_TESS"
|
||||||
|
cp "$TEMP_TESS/package/dist/worker.min.js" ./wasm/ocr/worker.min.js
|
||||||
|
rm -rf "$TEMP_TESS"
|
||||||
|
tar xzf tesseract.js-core-7.0.0.tgz -C ./wasm/ocr/core --strip-components=1
|
||||||
|
cp ./tesseract-langdata/*.traineddata.gz ./wasm/ocr/lang-data/
|
||||||
|
cp ./ocr-fonts/* ./wasm/ocr/fonts/
|
||||||
|
|
||||||
# Run BentoPDF
|
# Run BentoPDF
|
||||||
docker run -d -p 3000:8080 --restart unless-stopped bentopdf
|
docker run -d -p 3000:8080 --restart unless-stopped bentopdf
|
||||||
```
|
```
|
||||||
|
|
||||||
Make sure the WASM files are accessible at the URLs you configured in Step 2.
|
Make sure the files are accessible at the URLs you configured in Step 2, including `.../ocr/worker.min.js`, `.../ocr/core`, `.../ocr/lang-data`, and `.../ocr/fonts`.
|
||||||
|
|
||||||
</details>
|
</details>
|
||||||
|
|
||||||
@@ -613,6 +659,10 @@ Make sure the WASM files are accessible at the URLs you configured in Step 2.
|
|||||||
> VITE_WASM_PYMUPDF_URL=https://internal-server.example.com/wasm/pymupdf/
|
> VITE_WASM_PYMUPDF_URL=https://internal-server.example.com/wasm/pymupdf/
|
||||||
> VITE_WASM_GS_URL=https://internal-server.example.com/wasm/gs/
|
> VITE_WASM_GS_URL=https://internal-server.example.com/wasm/gs/
|
||||||
> VITE_WASM_CPDF_URL=https://internal-server.example.com/wasm/cpdf/
|
> VITE_WASM_CPDF_URL=https://internal-server.example.com/wasm/cpdf/
|
||||||
|
> VITE_TESSERACT_WORKER_URL=https://internal-server.example.com/wasm/ocr/worker.min.js
|
||||||
|
> VITE_TESSERACT_CORE_URL=https://internal-server.example.com/wasm/ocr/core
|
||||||
|
> VITE_TESSERACT_LANG_URL=https://internal-server.example.com/wasm/ocr/lang-data
|
||||||
|
> VITE_OCR_FONT_BASE_URL=https://internal-server.example.com/wasm/ocr/fonts
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
**Subdirectory Hosting:**
|
**Subdirectory Hosting:**
|
||||||
|
|||||||
@@ -34,6 +34,9 @@ docker compose up -d
|
|||||||
|
|
||||||
Then open `http://localhost:3000` in your browser.
|
Then open `http://localhost:3000` in your browser.
|
||||||
|
|
||||||
|
> [!NOTE]
|
||||||
|
> If you are preparing an air-gapped OCR deployment, you must host the OCR text-layer fonts internally in addition to the Tesseract worker, core runtime, and traineddata files. The full setup is documented in [Self-Hosting](/self-hosting/), including `VITE_OCR_FONT_BASE_URL` and the bundled `ocr-fonts/` directory.
|
||||||
|
|
||||||
### Option 3: Build from Source
|
### Option 3: Build from Source
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
|||||||
@@ -32,5 +32,11 @@ features:
|
|||||||
details: Convert, edit, merge, split, compress, sign, OCR, and more. Everything you need in one place.
|
details: Convert, edit, merge, split, compress, sign, OCR, and more. Everything you need in one place.
|
||||||
- icon: 🌐
|
- icon: 🌐
|
||||||
title: Self-Hostable
|
title: Self-Hostable
|
||||||
details: Deploy on your own infrastructure. Docker, Vercel, Netlify, AWS, or any static hosting.
|
details: Deploy on your own infrastructure. Docker, Vercel, Netlify, AWS, or fully air-gapped environments with self-hosted OCR workers, language data, and text-layer fonts.
|
||||||
|
|
||||||
|
## Offline OCR
|
||||||
|
|
||||||
|
If you self-host BentoPDF in an air-gapped or offline environment, OCR needs more than the Tesseract worker and traineddata files. Searchable PDF output also needs the OCR text-layer fonts to be served internally.
|
||||||
|
|
||||||
|
See [Self-Hosting](/self-hosting/) for the full setup, including `VITE_OCR_FONT_BASE_URL`, the bundled `ocr-fonts/` directory, and the updated air-gap workflow.
|
||||||
---
|
---
|
||||||
|
|||||||
@@ -90,20 +90,27 @@ docker run -d -p 3000:8080 bentopdf:custom
|
|||||||
|
|
||||||
## Environment Variables
|
## Environment Variables
|
||||||
|
|
||||||
| Variable | Description | Default |
|
| Variable | Description | Default |
|
||||||
| ----------------------- | ------------------------------- | -------------------------------------------------------------- |
|
| ------------------------------------ | ------------------------------------------- | -------------------------------------------------------------- |
|
||||||
| `SIMPLE_MODE` | Build without LibreOffice tools | `false` |
|
| `SIMPLE_MODE` | Build without LibreOffice tools | `false` |
|
||||||
| `BASE_URL` | Deploy to subdirectory | `/` |
|
| `BASE_URL` | Deploy to subdirectory | `/` |
|
||||||
| `VITE_WASM_PYMUPDF_URL` | PyMuPDF WASM module URL | `https://cdn.jsdelivr.net/npm/@bentopdf/pymupdf-wasm@0.11.16/` |
|
| `VITE_WASM_PYMUPDF_URL` | PyMuPDF WASM module URL | `https://cdn.jsdelivr.net/npm/@bentopdf/pymupdf-wasm@0.11.16/` |
|
||||||
| `VITE_WASM_GS_URL` | Ghostscript WASM module URL | `https://cdn.jsdelivr.net/npm/@bentopdf/gs-wasm/assets/` |
|
| `VITE_WASM_GS_URL` | Ghostscript WASM module URL | `https://cdn.jsdelivr.net/npm/@bentopdf/gs-wasm/assets/` |
|
||||||
| `VITE_WASM_CPDF_URL` | CoherentPDF WASM module URL | `https://cdn.jsdelivr.net/npm/coherentpdf/dist/` |
|
| `VITE_WASM_CPDF_URL` | CoherentPDF WASM module URL | `https://cdn.jsdelivr.net/npm/coherentpdf/dist/` |
|
||||||
| `VITE_DEFAULT_LANGUAGE` | Default UI language | `en` |
|
| `VITE_TESSERACT_WORKER_URL` | OCR worker script URL | _(empty; use Tesseract.js default CDN)_ |
|
||||||
| `VITE_BRAND_NAME` | Custom brand name | `BentoPDF` |
|
| `VITE_TESSERACT_CORE_URL` | OCR core runtime directory | _(empty; use Tesseract.js default CDN)_ |
|
||||||
| `VITE_BRAND_LOGO` | Logo path relative to `public/` | `images/favicon-no-bg.svg` |
|
| `VITE_TESSERACT_LANG_URL` | OCR traineddata directory | _(empty; use Tesseract.js default CDN)_ |
|
||||||
| `VITE_FOOTER_TEXT` | Custom footer/copyright text | `© 2026 BentoPDF. All rights reserved.` |
|
| `VITE_TESSERACT_AVAILABLE_LANGUAGES` | Comma-separated OCR languages exposed in UI | _(empty; show full catalog)_ |
|
||||||
|
| `VITE_OCR_FONT_BASE_URL` | OCR text-layer font directory | _(empty; use remote Noto font URLs)_ |
|
||||||
|
| `VITE_DEFAULT_LANGUAGE` | Default UI language | `en` |
|
||||||
|
| `VITE_BRAND_NAME` | Custom brand name | `BentoPDF` |
|
||||||
|
| `VITE_BRAND_LOGO` | Logo path relative to `public/` | `images/favicon-no-bg.svg` |
|
||||||
|
| `VITE_FOOTER_TEXT` | Custom footer/copyright text | `© 2026 BentoPDF. All rights reserved.` |
|
||||||
|
|
||||||
WASM module URLs are pre-configured with CDN defaults — all advanced features work out of the box. Override these for air-gapped or self-hosted deployments.
|
WASM module URLs are pre-configured with CDN defaults — all advanced features work out of the box. Override these for air-gapped or self-hosted deployments.
|
||||||
|
|
||||||
|
For OCR, leave the `VITE_TESSERACT_*` variables empty to use the default online assets, or set `VITE_TESSERACT_WORKER_URL`, `VITE_TESSERACT_CORE_URL`, and `VITE_TESSERACT_LANG_URL` together for self-hosted/offline OCR. Partial OCR overrides are rejected because the worker, core runtime, and traineddata directory must match. For fully offline searchable PDF output, also set `VITE_OCR_FONT_BASE_URL` so the OCR text-layer fonts are loaded from your internal server instead of the public Noto font URLs.
|
||||||
|
|
||||||
`VITE_DEFAULT_LANGUAGE` sets the UI language for first-time visitors. Supported values: `en`, `ar`, `be`, `fr`, `de`, `es`, `zh`, `zh-TW`, `vi`, `tr`, `id`, `it`, `pt`, `nl`, `da`. Users can still switch languages — this only changes the default.
|
`VITE_DEFAULT_LANGUAGE` sets the UI language for first-time visitors. Supported values: `en`, `ar`, `be`, `fr`, `de`, `es`, `zh`, `zh-TW`, `vi`, `tr`, `id`, `it`, `pt`, `nl`, `da`. Users can still switch languages — this only changes the default.
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
@@ -137,35 +144,59 @@ Branding works in both full mode and Simple Mode, and can be combined with all o
|
|||||||
|
|
||||||
```bash
|
```bash
|
||||||
# 1. On a machine WITH internet — download WASM packages
|
# 1. On a machine WITH internet — look up the OCR language codes to bundle
|
||||||
|
bash scripts/prepare-airgap.sh --list-ocr-languages
|
||||||
|
bash scripts/prepare-airgap.sh --search-ocr-language german
|
||||||
|
|
||||||
|
# 2. Download WASM/OCR packages
|
||||||
npm pack @bentopdf/pymupdf-wasm@0.11.14
|
npm pack @bentopdf/pymupdf-wasm@0.11.14
|
||||||
npm pack @bentopdf/gs-wasm
|
npm pack @bentopdf/gs-wasm
|
||||||
npm pack coherentpdf
|
npm pack coherentpdf
|
||||||
|
npm pack tesseract.js@7.0.0
|
||||||
|
npm pack tesseract.js-core@7.0.0
|
||||||
|
mkdir -p tesseract-langdata
|
||||||
|
curl -fsSL https://cdn.jsdelivr.net/npm/@tesseract.js-data/eng/4.0.0_best_int/eng.traineddata.gz -o tesseract-langdata/eng.traineddata.gz
|
||||||
|
mkdir -p ocr-fonts
|
||||||
|
curl -fsSL https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSans/NotoSans-Regular.ttf -o ocr-fonts/NotoSans-Regular.ttf
|
||||||
|
|
||||||
# 2. Build the image with your internal server URLs
|
# 3. Build the image with your internal server URLs
|
||||||
docker build \
|
docker build \
|
||||||
--build-arg VITE_WASM_PYMUPDF_URL=https://internal-server.example.com/wasm/pymupdf/ \
|
--build-arg VITE_WASM_PYMUPDF_URL=https://internal-server.example.com/wasm/pymupdf/ \
|
||||||
--build-arg VITE_WASM_GS_URL=https://internal-server.example.com/wasm/gs/ \
|
--build-arg VITE_WASM_GS_URL=https://internal-server.example.com/wasm/gs/ \
|
||||||
--build-arg VITE_WASM_CPDF_URL=https://internal-server.example.com/wasm/cpdf/ \
|
--build-arg VITE_WASM_CPDF_URL=https://internal-server.example.com/wasm/cpdf/ \
|
||||||
|
--build-arg VITE_TESSERACT_WORKER_URL=https://internal-server.example.com/wasm/ocr/worker.min.js \
|
||||||
|
--build-arg VITE_TESSERACT_CORE_URL=https://internal-server.example.com/wasm/ocr/core \
|
||||||
|
--build-arg VITE_TESSERACT_LANG_URL=https://internal-server.example.com/wasm/ocr/lang-data \
|
||||||
|
--build-arg VITE_TESSERACT_AVAILABLE_LANGUAGES=eng,deu \
|
||||||
|
--build-arg VITE_OCR_FONT_BASE_URL=https://internal-server.example.com/wasm/ocr/fonts \
|
||||||
-t bentopdf .
|
-t bentopdf .
|
||||||
|
|
||||||
# 3. Export the image
|
# 4. Export the image
|
||||||
docker save bentopdf -o bentopdf.tar
|
docker save bentopdf -o bentopdf.tar
|
||||||
|
|
||||||
# 4. Transfer bentopdf.tar + the .tgz WASM packages into the air-gapped network
|
# 5. Transfer bentopdf.tar + the .tgz packages + tesseract-langdata/ + ocr-fonts/ into the air-gapped network
|
||||||
|
|
||||||
# 5. Inside the air-gapped network — load and run
|
# 6. Inside the air-gapped network — load and run
|
||||||
docker load -i bentopdf.tar
|
docker load -i bentopdf.tar
|
||||||
|
|
||||||
# Extract WASM packages to your internal web server
|
# Extract WASM packages to your internal web server
|
||||||
mkdir -p /var/www/wasm/pymupdf /var/www/wasm/gs /var/www/wasm/cpdf
|
mkdir -p /var/www/wasm/pymupdf /var/www/wasm/gs /var/www/wasm/cpdf /var/www/wasm/ocr/core /var/www/wasm/ocr/lang-data /var/www/wasm/ocr/fonts
|
||||||
tar xzf bentopdf-pymupdf-wasm-0.11.14.tgz -C /var/www/wasm/pymupdf --strip-components=1
|
tar xzf bentopdf-pymupdf-wasm-0.11.14.tgz -C /var/www/wasm/pymupdf --strip-components=1
|
||||||
tar xzf bentopdf-gs-wasm-*.tgz -C /var/www/wasm/gs --strip-components=1
|
tar xzf bentopdf-gs-wasm-*.tgz -C /var/www/wasm/gs --strip-components=1
|
||||||
tar xzf coherentpdf-*.tgz -C /var/www/wasm/cpdf --strip-components=1
|
tar xzf coherentpdf-*.tgz -C /var/www/wasm/cpdf --strip-components=1
|
||||||
|
TEMP_TESS=$(mktemp -d)
|
||||||
|
tar xzf tesseract.js-7.0.0.tgz -C "$TEMP_TESS"
|
||||||
|
cp "$TEMP_TESS/package/dist/worker.min.js" /var/www/wasm/ocr/worker.min.js
|
||||||
|
rm -rf "$TEMP_TESS"
|
||||||
|
tar xzf tesseract.js-core-7.0.0.tgz -C /var/www/wasm/ocr/core --strip-components=1
|
||||||
|
cp ./tesseract-langdata/*.traineddata.gz /var/www/wasm/ocr/lang-data/
|
||||||
|
cp ./ocr-fonts/* /var/www/wasm/ocr/fonts/
|
||||||
|
|
||||||
# Run BentoPDF
|
# Run BentoPDF
|
||||||
docker run -d -p 3000:8080 --restart unless-stopped bentopdf
|
docker run -d -p 3000:8080 --restart unless-stopped bentopdf
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Use the codes printed by `bash scripts/prepare-airgap.sh --list-ocr-languages`, or search by name with `bash scripts/prepare-airgap.sh --search-ocr-language <term>`, for `--ocr-languages`. When you build with a restricted OCR subset, pass the same codes to `VITE_TESSERACT_AVAILABLE_LANGUAGES` so the app only shows bundled languages. For full offline OCR output, also host the bundled `ocr-fonts/` directory and point `VITE_OCR_FONT_BASE_URL` at it.
|
||||||
|
|
||||||
Set a variable to empty string to disable that module (users must configure manually via Advanced Settings).
|
Set a variable to empty string to disable that module (users must configure manually via Advanced Settings).
|
||||||
|
|
||||||
## Custom User ID (PUID/PGID)
|
## Custom User ID (PUID/PGID)
|
||||||
|
|||||||
@@ -175,6 +175,11 @@ These are set in `.env.production` and baked into the build:
|
|||||||
VITE_WASM_PYMUPDF_URL=https://cdn.jsdelivr.net/npm/@bentopdf/pymupdf-wasm@0.11.16/
|
VITE_WASM_PYMUPDF_URL=https://cdn.jsdelivr.net/npm/@bentopdf/pymupdf-wasm@0.11.16/
|
||||||
VITE_WASM_GS_URL=https://cdn.jsdelivr.net/npm/@bentopdf/gs-wasm/assets/
|
VITE_WASM_GS_URL=https://cdn.jsdelivr.net/npm/@bentopdf/gs-wasm/assets/
|
||||||
VITE_WASM_CPDF_URL=https://cdn.jsdelivr.net/npm/coherentpdf/dist/
|
VITE_WASM_CPDF_URL=https://cdn.jsdelivr.net/npm/coherentpdf/dist/
|
||||||
|
VITE_TESSERACT_WORKER_URL=
|
||||||
|
VITE_TESSERACT_CORE_URL=
|
||||||
|
VITE_TESSERACT_LANG_URL=
|
||||||
|
VITE_TESSERACT_AVAILABLE_LANGUAGES=
|
||||||
|
VITE_OCR_FONT_BASE_URL=
|
||||||
```
|
```
|
||||||
|
|
||||||
### Overriding WASM URLs
|
### Overriding WASM URLs
|
||||||
@@ -187,6 +192,11 @@ docker build \
|
|||||||
--build-arg VITE_WASM_PYMUPDF_URL=https://your-server.com/pymupdf/ \
|
--build-arg VITE_WASM_PYMUPDF_URL=https://your-server.com/pymupdf/ \
|
||||||
--build-arg VITE_WASM_GS_URL=https://your-server.com/gs/ \
|
--build-arg VITE_WASM_GS_URL=https://your-server.com/gs/ \
|
||||||
--build-arg VITE_WASM_CPDF_URL=https://your-server.com/cpdf/ \
|
--build-arg VITE_WASM_CPDF_URL=https://your-server.com/cpdf/ \
|
||||||
|
--build-arg VITE_TESSERACT_WORKER_URL=https://your-server.com/ocr/worker.min.js \
|
||||||
|
--build-arg VITE_TESSERACT_CORE_URL=https://your-server.com/ocr/core \
|
||||||
|
--build-arg VITE_TESSERACT_LANG_URL=https://your-server.com/ocr/lang-data \
|
||||||
|
--build-arg VITE_TESSERACT_AVAILABLE_LANGUAGES=eng,deu \
|
||||||
|
--build-arg VITE_OCR_FONT_BASE_URL=https://your-server.com/ocr/fonts \
|
||||||
-t bentopdf .
|
-t bentopdf .
|
||||||
|
|
||||||
# Or via .env.production before building from source
|
# Or via .env.production before building from source
|
||||||
@@ -195,6 +205,8 @@ VITE_WASM_PYMUPDF_URL=https://your-server.com/pymupdf/ npm run build
|
|||||||
|
|
||||||
To disable a module entirely (require manual user config via Advanced Settings), set its variable to an empty string.
|
To disable a module entirely (require manual user config via Advanced Settings), set its variable to an empty string.
|
||||||
|
|
||||||
|
For OCR, either leave all `VITE_TESSERACT_*` variables empty and keep the default online assets, or set the worker/core/lang URLs together for self-hosted/offline OCR. If you bundle only specific OCR languages, also set `VITE_TESSERACT_AVAILABLE_LANGUAGES` to the same comma-separated codes so the UI only offers installed languages and unsupported selections fail with a descriptive error. For fully offline searchable-PDF output, also set `VITE_OCR_FONT_BASE_URL` to the internal directory that serves the bundled OCR fonts.
|
||||||
|
|
||||||
Users can also override these defaults at any time via **Advanced Settings** in the UI — user overrides stored in the browser take priority over environment defaults.
|
Users can also override these defaults at any time via **Advanced Settings** in the UI — user overrides stored in the browser take priority over environment defaults.
|
||||||
|
|
||||||
### Air-Gapped / Offline Deployment
|
### Air-Gapped / Offline Deployment
|
||||||
@@ -209,6 +221,12 @@ The included `prepare-airgap.sh` script automates the entire process — downloa
|
|||||||
git clone https://github.com/alam00000/bentopdf.git
|
git clone https://github.com/alam00000/bentopdf.git
|
||||||
cd bentopdf
|
cd bentopdf
|
||||||
|
|
||||||
|
# Show supported OCR language codes (for --ocr-languages)
|
||||||
|
bash scripts/prepare-airgap.sh --list-ocr-languages
|
||||||
|
|
||||||
|
# Search OCR language codes by name or abbreviation
|
||||||
|
bash scripts/prepare-airgap.sh --search-ocr-language german
|
||||||
|
|
||||||
# Interactive mode — prompts for all options
|
# Interactive mode — prompts for all options
|
||||||
bash scripts/prepare-airgap.sh
|
bash scripts/prepare-airgap.sh
|
||||||
|
|
||||||
@@ -221,7 +239,9 @@ This produces a bundle directory:
|
|||||||
```
|
```
|
||||||
bentopdf-airgap-bundle/
|
bentopdf-airgap-bundle/
|
||||||
bentopdf.tar # Docker image
|
bentopdf.tar # Docker image
|
||||||
*.tgz # WASM packages (PyMuPDF, Ghostscript, CoherentPDF)
|
*.tgz # WASM packages (PyMuPDF, Ghostscript, CoherentPDF, Tesseract)
|
||||||
|
tesseract-langdata/ # OCR traineddata files
|
||||||
|
ocr-fonts/ # OCR text-layer font files
|
||||||
setup.sh # Setup script for the air-gapped side
|
setup.sh # Setup script for the air-gapped side
|
||||||
README.md # Instructions
|
README.md # Instructions
|
||||||
```
|
```
|
||||||
@@ -237,20 +257,25 @@ The setup script loads the Docker image, extracts WASM files, and optionally sta
|
|||||||
|
|
||||||
**Script options:**
|
**Script options:**
|
||||||
|
|
||||||
| Flag | Description | Default |
|
| Flag | Description | Default |
|
||||||
| ----------------------- | ------------------------------------------------ | --------------------------------- |
|
| ------------------------------ | ------------------------------------------------ | --------------------------------- |
|
||||||
| `--wasm-base-url <url>` | Where WASMs will be hosted internally | _(required, prompted if missing)_ |
|
| `--wasm-base-url <url>` | Where WASMs will be hosted internally | _(required, prompted if missing)_ |
|
||||||
| `--image-name <name>` | Docker image tag | `bentopdf` |
|
| `--image-name <name>` | Docker image tag | `bentopdf` |
|
||||||
| `--output-dir <path>` | Output bundle directory | `./bentopdf-airgap-bundle` |
|
| `--output-dir <path>` | Output bundle directory | `./bentopdf-airgap-bundle` |
|
||||||
| `--simple-mode` | Enable Simple Mode | off |
|
| `--simple-mode` | Enable Simple Mode | off |
|
||||||
| `--base-url <path>` | Subdirectory base URL (e.g. `/pdf/`) | `/` |
|
| `--base-url <path>` | Subdirectory base URL (e.g. `/pdf/`) | `/` |
|
||||||
| `--language <code>` | Default UI language (e.g. `fr`, `de`) | _(none)_ |
|
| `--language <code>` | Default UI language (e.g. `fr`, `de`) | _(none)_ |
|
||||||
| `--brand-name <name>` | Custom brand name | _(none)_ |
|
| `--brand-name <name>` | Custom brand name | _(none)_ |
|
||||||
| `--brand-logo <path>` | Logo path relative to `public/` | _(none)_ |
|
| `--brand-logo <path>` | Logo path relative to `public/` | _(none)_ |
|
||||||
| `--footer-text <text>` | Custom footer text | _(none)_ |
|
| `--footer-text <text>` | Custom footer text | _(none)_ |
|
||||||
| `--dockerfile <path>` | Dockerfile to use | `Dockerfile` |
|
| `--ocr-languages <list>` | Comma-separated OCR languages to bundle | `eng` |
|
||||||
| `--skip-docker` | Skip Docker build and export | off |
|
| `--list-ocr-languages` | Print supported OCR codes and names, then exit | off |
|
||||||
| `--skip-wasm` | Skip WASM download (reuse existing `.tgz` files) | off |
|
| `--search-ocr-language <term>` | Search OCR codes by name or abbreviation, then exit | _(none)_ |
|
||||||
|
| `--dockerfile <path>` | Dockerfile to use | `Dockerfile` |
|
||||||
|
| `--skip-docker` | Skip Docker build and export | off |
|
||||||
|
| `--skip-wasm` | Skip WASM download (reuse existing `.tgz` files) | off |
|
||||||
|
|
||||||
|
The interactive prompt also accepts `list` to print the full supported Tesseract code list and `search <term>` to find matches such as `search german` or `search chi`.
|
||||||
|
|
||||||
::: warning Same-Origin Requirement
|
::: warning Same-Origin Requirement
|
||||||
WASM files must be served from the **same origin** as the BentoPDF app. Web Workers use `importScripts()` which cannot load scripts cross-origin. For example, if BentoPDF runs at `https://internal.example.com`, the WASM base URL should also be `https://internal.example.com/wasm`.
|
WASM files must be served from the **same origin** as the BentoPDF app. Web Workers use `importScripts()` which cannot load scripts cross-origin. For example, if BentoPDF runs at `https://internal.example.com`, the WASM base URL should also be `https://internal.example.com/wasm`.
|
||||||
@@ -261,12 +286,18 @@ WASM files must be served from the **same origin** as the BentoPDF app. Web Work
|
|||||||
<details>
|
<details>
|
||||||
<summary>If you prefer to do it manually without the script</summary>
|
<summary>If you prefer to do it manually without the script</summary>
|
||||||
|
|
||||||
**Step 1: Download the WASM packages** (on a machine with internet)
|
**Step 1: Download the WASM and OCR packages** (on a machine with internet)
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
npm pack @bentopdf/pymupdf-wasm@0.11.14
|
npm pack @bentopdf/pymupdf-wasm@0.11.14
|
||||||
npm pack @bentopdf/gs-wasm
|
npm pack @bentopdf/gs-wasm
|
||||||
npm pack coherentpdf
|
npm pack coherentpdf
|
||||||
|
npm pack tesseract.js@7.0.0
|
||||||
|
npm pack tesseract.js-core@7.0.0
|
||||||
|
mkdir -p tesseract-langdata
|
||||||
|
curl -fsSL https://cdn.jsdelivr.net/npm/@tesseract.js-data/eng/4.0.0_best_int/eng.traineddata.gz -o tesseract-langdata/eng.traineddata.gz
|
||||||
|
mkdir -p ocr-fonts
|
||||||
|
curl -fsSL https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSans/NotoSans-Regular.ttf -o ocr-fonts/NotoSans-Regular.ttf
|
||||||
```
|
```
|
||||||
|
|
||||||
**Step 2: Build the Docker image with internal URLs**
|
**Step 2: Build the Docker image with internal URLs**
|
||||||
@@ -279,6 +310,10 @@ docker build \
|
|||||||
--build-arg VITE_WASM_PYMUPDF_URL=https://internal-server.example.com/wasm/pymupdf/ \
|
--build-arg VITE_WASM_PYMUPDF_URL=https://internal-server.example.com/wasm/pymupdf/ \
|
||||||
--build-arg VITE_WASM_GS_URL=https://internal-server.example.com/wasm/gs/ \
|
--build-arg VITE_WASM_GS_URL=https://internal-server.example.com/wasm/gs/ \
|
||||||
--build-arg VITE_WASM_CPDF_URL=https://internal-server.example.com/wasm/cpdf/ \
|
--build-arg VITE_WASM_CPDF_URL=https://internal-server.example.com/wasm/cpdf/ \
|
||||||
|
--build-arg VITE_TESSERACT_WORKER_URL=https://internal-server.example.com/wasm/ocr/worker.min.js \
|
||||||
|
--build-arg VITE_TESSERACT_CORE_URL=https://internal-server.example.com/wasm/ocr/core \
|
||||||
|
--build-arg VITE_TESSERACT_LANG_URL=https://internal-server.example.com/wasm/ocr/lang-data \
|
||||||
|
--build-arg VITE_OCR_FONT_BASE_URL=https://internal-server.example.com/wasm/ocr/fonts \
|
||||||
-t bentopdf .
|
-t bentopdf .
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -293,7 +328,9 @@ docker save bentopdf -o bentopdf.tar
|
|||||||
Copy via USB, internal artifact repo, or approved transfer method:
|
Copy via USB, internal artifact repo, or approved transfer method:
|
||||||
|
|
||||||
- `bentopdf.tar` — the Docker image
|
- `bentopdf.tar` — the Docker image
|
||||||
- The three `.tgz` WASM packages from Step 1
|
- The five `.tgz` WASM/OCR packages from Step 1
|
||||||
|
- The `tesseract-langdata/` directory from Step 1
|
||||||
|
- The `ocr-fonts/` directory from Step 1
|
||||||
|
|
||||||
**Step 5: Set up inside the air-gapped network**
|
**Step 5: Set up inside the air-gapped network**
|
||||||
|
|
||||||
@@ -302,16 +339,23 @@ Copy via USB, internal artifact repo, or approved transfer method:
|
|||||||
docker load -i bentopdf.tar
|
docker load -i bentopdf.tar
|
||||||
|
|
||||||
# Extract WASM packages
|
# Extract WASM packages
|
||||||
mkdir -p ./wasm/pymupdf ./wasm/gs ./wasm/cpdf
|
mkdir -p ./wasm/pymupdf ./wasm/gs ./wasm/cpdf ./wasm/ocr/core ./wasm/ocr/lang-data ./wasm/ocr/fonts
|
||||||
tar xzf bentopdf-pymupdf-wasm-0.11.14.tgz -C ./wasm/pymupdf --strip-components=1
|
tar xzf bentopdf-pymupdf-wasm-0.11.14.tgz -C ./wasm/pymupdf --strip-components=1
|
||||||
tar xzf bentopdf-gs-wasm-*.tgz -C ./wasm/gs --strip-components=1
|
tar xzf bentopdf-gs-wasm-*.tgz -C ./wasm/gs --strip-components=1
|
||||||
tar xzf coherentpdf-*.tgz -C ./wasm/cpdf --strip-components=1
|
tar xzf coherentpdf-*.tgz -C ./wasm/cpdf --strip-components=1
|
||||||
|
TEMP_TESS=$(mktemp -d)
|
||||||
|
tar xzf tesseract.js-7.0.0.tgz -C "$TEMP_TESS"
|
||||||
|
cp "$TEMP_TESS/package/dist/worker.min.js" ./wasm/ocr/worker.min.js
|
||||||
|
rm -rf "$TEMP_TESS"
|
||||||
|
tar xzf tesseract.js-core-7.0.0.tgz -C ./wasm/ocr/core --strip-components=1
|
||||||
|
cp ./tesseract-langdata/*.traineddata.gz ./wasm/ocr/lang-data/
|
||||||
|
cp ./ocr-fonts/* ./wasm/ocr/fonts/
|
||||||
|
|
||||||
# Run BentoPDF
|
# Run BentoPDF
|
||||||
docker run -d -p 3000:8080 --restart unless-stopped bentopdf
|
docker run -d -p 3000:8080 --restart unless-stopped bentopdf
|
||||||
```
|
```
|
||||||
|
|
||||||
Make sure the WASM files are accessible at the URLs you configured in Step 2.
|
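Make sure the files are accessible at the URLs you configured in Step 2, including `.../ocr/worker.min.js`, `.../ocr/core`, `.../ocr/lang-data`, and `.../ocr/fonts`. Because Step 1 downloads only the `eng` traineddata file, also pass `--build-arg VITE_TESSERACT_AVAILABLE_LANGUAGES=eng` in Step 2 (or list every language code you bundled) so the UI only offers the OCR languages that are actually installed.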
|
||||||
|
|
||||||
</details>
|
</details>
|
||||||
|
|
||||||
@@ -322,6 +366,10 @@ Set the variables in `.env.production` before running `npm run build`:
|
|||||||
VITE_WASM_PYMUPDF_URL=https://internal-server.example.com/wasm/pymupdf/
|
VITE_WASM_PYMUPDF_URL=https://internal-server.example.com/wasm/pymupdf/
|
||||||
VITE_WASM_GS_URL=https://internal-server.example.com/wasm/gs/
|
VITE_WASM_GS_URL=https://internal-server.example.com/wasm/gs/
|
||||||
VITE_WASM_CPDF_URL=https://internal-server.example.com/wasm/cpdf/
|
VITE_WASM_CPDF_URL=https://internal-server.example.com/wasm/cpdf/
|
||||||
|
VITE_TESSERACT_WORKER_URL=https://internal-server.example.com/wasm/ocr/worker.min.js
|
||||||
|
VITE_TESSERACT_CORE_URL=https://internal-server.example.com/wasm/ocr/core
|
||||||
|
VITE_TESSERACT_LANG_URL=https://internal-server.example.com/wasm/ocr/lang-data
|
||||||
|
VITE_OCR_FONT_BASE_URL=https://internal-server.example.com/wasm/ocr/fonts
|
||||||
```
|
```
|
||||||
|
|
||||||
:::
|
:::
|
||||||
|
|||||||
@@ -13,6 +13,8 @@ set -euo pipefail
|
|||||||
# Usage:
|
# Usage:
|
||||||
# bash scripts/prepare-airgap.sh --wasm-base-url https://internal.example.com/wasm
|
# bash scripts/prepare-airgap.sh --wasm-base-url https://internal.example.com/wasm
|
||||||
# bash scripts/prepare-airgap.sh # interactive mode
|
# bash scripts/prepare-airgap.sh # interactive mode
|
||||||
|
# bash scripts/prepare-airgap.sh --ocr-languages eng,deu,fra
|
||||||
|
# bash scripts/prepare-airgap.sh --search-ocr-language german
|
||||||
#
|
#
|
||||||
# See --help for all options.
|
# See --help for all options.
|
||||||
# ============================================================
|
# ============================================================
|
||||||
@@ -54,6 +56,110 @@ DOCKERFILE="Dockerfile"
|
|||||||
SKIP_DOCKER=false
|
SKIP_DOCKER=false
|
||||||
SKIP_WASM=false
|
SKIP_WASM=false
|
||||||
INTERACTIVE=false
|
INTERACTIVE=false
|
||||||
|
OCR_LANGUAGES="eng"
|
||||||
|
TESSDATA_VERSION="4.0.0_best_int"
|
||||||
|
LIST_OCR_LANGUAGES=false
|
||||||
|
SEARCH_OCR_LANGUAGE_TERM=""
|
||||||
|
|
||||||
|
TESSERACT_LANGUAGE_CONFIG="src/js/config/tesseract-languages.ts"
|
||||||
|
FONT_MAPPING_CONFIG="src/js/config/font-mappings.ts"
|
||||||
|
|
||||||
|
SUPPORTED_OCR_LANGUAGES_RAW=""
|
||||||
|
OCR_FONT_MANIFEST_RAW=""
|
||||||
|
|
||||||
|
# Fill SUPPORTED_OCR_LANGUAGES_RAW with one "<code><TAB><name>" line for every
# `code: 'Name'` entry found in TESSERACT_LANGUAGE_CONFIG.
# The value is computed once; later calls return immediately.
# Exits the script with status 1 when the config file is missing or yields no
# entries.
load_supported_ocr_languages() {
  # Already populated by an earlier call.
  [ -z "$SUPPORTED_OCR_LANGUAGES_RAW" ] || return 0

  if ! [ -f "$TESSERACT_LANGUAGE_CONFIG" ]; then
    error "Missing OCR language config: ${TESSERACT_LANGUAGE_CONFIG}"
    exit 1
  fi

  # Node program that scans the config source with a regex instead of
  # importing it. The doubled backslashes collapse to single ones inside the
  # double-quoted shell string before Node sees them.
  local extract_languages_js="const fs = require('fs'); const source = fs.readFileSync(process.argv[1], 'utf8'); const languages = []; const pattern = /^\\s*([a-z0-9_]+):\\s*'([^']+)'/gm; let match; while ((match = pattern.exec(source)) !== null) { languages.push(match[1] + '\\t' + match[2]); } process.stdout.write(languages.join('\\n'));"

  SUPPORTED_OCR_LANGUAGES_RAW=$(node -e "$extract_languages_js" "$TESSERACT_LANGUAGE_CONFIG")

  if [ -z "$SUPPORTED_OCR_LANGUAGES_RAW" ]; then
    error "Failed to load supported OCR languages from ${TESSERACT_LANGUAGE_CONFIG}"
    exit 1
  fi
}
|
||||||
|
|
||||||
|
# Succeed (status 0) when $1 equals the code column of some line in
# SUPPORTED_OCR_LANGUAGES_RAW; fail (status 1) otherwise.
is_supported_ocr_language() {
  local candidate="$1"
  load_supported_ocr_languages

  # The list is tab-separated: column 1 is the code, column 2 the name.
  awk -F '\t' -v wanted="$candidate" '
    BEGIN { status = 1 }
    $1 == wanted { status = 0 }
    END { exit status }
  ' <<< "$SUPPORTED_OCR_LANGUAGES_RAW"
}
|
||||||
|
|
||||||
|
# Print every supported OCR language as an aligned "code  name" table,
# framed by a heading and a usage example for --ocr-languages.
show_supported_ocr_languages() {
  load_supported_ocr_languages

  printf '\n'
  # echo -e so escape sequences stored in BOLD/NC are interpreted.
  echo -e "${BOLD}Supported OCR languages:${NC}"
  printf '%s\n' " Use the code in the left column for --ocr-languages."
  printf '\n'
  # Column 1 (code) is left-aligned in a 12-character field.
  awk -F '\t' '{ printf " %-12s %s\n", $1, $2 }' <<< "$SUPPORTED_OCR_LANGUAGES_RAW"
  printf '\n'
  printf '%s\n' " Example: --ocr-languages eng,deu,fra,spa"
  printf '\n'
}
|
||||||
|
|
||||||
|
# Print the supported OCR languages whose code or name contains $1
# (case-insensitive substring match).
# Returns 0 when at least one language matched and 1 when none did.
# Exits the script with status 1 when the query is empty.
show_matching_ocr_languages() {
  local query="$1"
  load_supported_ocr_languages

  if [ -z "$query" ]; then
    error "OCR language search requires a non-empty query."
    exit 1
  fi

  # Keep the original "<code><TAB><name>" shape for every matching line.
  local matches
  matches=$(awk -F '\t' -v query="$query" '
    BEGIN { needle = tolower(query) }
    index(tolower($1), needle) || index(tolower($2), needle) {
      printf "%s\t%s\n", $1, $2
    }
  ' <<< "$SUPPORTED_OCR_LANGUAGES_RAW")

  printf '\n'
  # echo -e so escape sequences stored in BOLD/NC are interpreted.
  echo -e "${BOLD}OCR language search:${NC} ${query}"

  if [ -z "$matches" ]; then
    printf '%s\n' " No supported OCR languages matched that query."
    printf '%s\n' " Tip: run --list-ocr-languages to browse the full list."
    printf '\n'
    return 1
  fi

  printf '%s\n' " Matching codes for --ocr-languages:"
  printf '\n'
  awk -F '\t' '{ printf " %-12s %s\n", $1, $2 }' <<< "$matches"
  printf '\n'
}
|
||||||
|
|
||||||
|
# Resolve the font files needed for the selected OCR languages and store them
# in OCR_FONT_MANIFEST_RAW, one "<family><TAB><url><TAB><file name>" line per
# font family.
#
# Reads FONT_MAPPING_CONFIG (path to the font mapping source file) and
# OCR_LANGUAGES (comma-separated language codes). 'Noto Sans' is always
# included and is the fallback family for languages without a mapping.
# The result is computed once; later calls return immediately.
# Exits the script with status 1 when the mapping file is missing or no font
# could be resolved.
load_required_ocr_fonts() {
  if [ -n "$OCR_FONT_MANIFEST_RAW" ]; then
    return
  fi

  if [ ! -f "$FONT_MAPPING_CONFIG" ]; then
    error "Missing OCR font mapping config: ${FONT_MAPPING_CONFIG}"
    exit 1
  fi

  # The mapping file is scanned with regexes rather than imported. Everything
  # before 'export const fontFamilyToUrl' is treated as the language -> family
  # table and everything after it as the family -> URL table.
  # Language keys accept [a-z0-9_] so they agree with the language-code
  # pattern used by the OCR language loader and validator; a narrower pattern
  # would silently map such codes to the 'Noto Sans' fallback.
  OCR_FONT_MANIFEST_RAW=$(node -e "
    const fs = require('fs');
    const source = fs.readFileSync(process.argv[1], 'utf8');
    const selected = (process.argv[2] || '').split(',').map((value) => value.trim()).filter(Boolean);
    const sections = source.split('export const fontFamilyToUrl');
    const languageSection = sections[0] || '';
    const fontSection = sections[1] || '';
    const languageToFamily = {};
    const fontFamilyToUrl = {};
    let match;
    const languagePattern = /^\s*([a-z0-9_]+):\s*'([^']+)',/gm;
    while ((match = languagePattern.exec(languageSection)) !== null) {
      languageToFamily[match[1]] = match[2];
    }
    const fontPattern = /^\s*'([^']+)':\s*'([^']+)',/gm;
    while ((match = fontPattern.exec(fontSection)) !== null) {
      fontFamilyToUrl[match[1]] = match[2];
    }
    const families = new Set(['Noto Sans']);
    for (const lang of selected) {
      families.add(languageToFamily[lang] || 'Noto Sans');
    }
    const lines = Array.from(families).sort().map((family) => {
      const url = fontFamilyToUrl[family] || fontFamilyToUrl['Noto Sans'];
      const fileName = url.split('/').pop();
      return [family, url, fileName].join('\t');
    });
    process.stdout.write(lines.join('\n'));
  " "$FONT_MAPPING_CONFIG" "$OCR_LANGUAGES")

  if [ -z "$OCR_FONT_MANIFEST_RAW" ]; then
    error "Failed to resolve OCR font assets from ${FONT_MAPPING_CONFIG}"
    exit 1
  fi
}
|
||||||
|
|
||||||
# --- Usage ---
|
# --- Usage ---
|
||||||
usage() {
|
usage() {
|
||||||
@@ -80,6 +186,10 @@ OPTIONS:
|
|||||||
--brand-name <name> Custom brand name
|
--brand-name <name> Custom brand name
|
||||||
--brand-logo <path> Logo path relative to public/
|
--brand-logo <path> Logo path relative to public/
|
||||||
--footer-text <text> Custom footer text
|
--footer-text <text> Custom footer text
|
||||||
|
--ocr-languages <list> Comma-separated OCR languages to bundle
|
||||||
|
(default: eng)
|
||||||
|
--list-ocr-languages Print supported OCR language codes and exit
|
||||||
|
--search-ocr-language Search supported OCR languages by code or name
|
||||||
--skip-docker Skip Docker build and export
|
--skip-docker Skip Docker build and export
|
||||||
--skip-wasm Skip WASM download (reuse existing .tgz files)
|
--skip-wasm Skip WASM download (reuse existing .tgz files)
|
||||||
--help Show this help message
|
--help Show this help message
|
||||||
@@ -91,6 +201,7 @@ EXAMPLES:
|
|||||||
# Full automation
|
# Full automation
|
||||||
bash scripts/prepare-airgap.sh \
|
bash scripts/prepare-airgap.sh \
|
||||||
--wasm-base-url https://internal.example.com/wasm \
|
--wasm-base-url https://internal.example.com/wasm \
|
||||||
|
--ocr-languages eng,deu,fra \
|
||||||
--brand-name "AcmePDF" \
|
--brand-name "AcmePDF" \
|
||||||
--language fr
|
--language fr
|
||||||
|
|
||||||
@@ -98,6 +209,12 @@ EXAMPLES:
|
|||||||
bash scripts/prepare-airgap.sh \
|
bash scripts/prepare-airgap.sh \
|
||||||
--wasm-base-url https://internal.example.com/wasm \
|
--wasm-base-url https://internal.example.com/wasm \
|
||||||
--skip-docker
|
--skip-docker
|
||||||
|
|
||||||
|
# Show all supported OCR language codes
|
||||||
|
bash scripts/prepare-airgap.sh --list-ocr-languages
|
||||||
|
|
||||||
|
# Search OCR languages by code or human-readable name
|
||||||
|
bash scripts/prepare-airgap.sh --search-ocr-language german
|
||||||
EOF
|
EOF
|
||||||
exit 0
|
exit 0
|
||||||
}
|
}
|
||||||
@@ -115,6 +232,9 @@ while [[ $# -gt 0 ]]; do
|
|||||||
--brand-name) BRAND_NAME="$2"; shift 2 ;;
|
--brand-name) BRAND_NAME="$2"; shift 2 ;;
|
||||||
--brand-logo) BRAND_LOGO="$2"; shift 2 ;;
|
--brand-logo) BRAND_LOGO="$2"; shift 2 ;;
|
||||||
--footer-text) FOOTER_TEXT="$2"; shift 2 ;;
|
--footer-text) FOOTER_TEXT="$2"; shift 2 ;;
|
||||||
|
--ocr-languages) OCR_LANGUAGES="$2"; shift 2 ;;
|
||||||
|
--list-ocr-languages) LIST_OCR_LANGUAGES=true; shift ;;
|
||||||
|
--search-ocr-language) SEARCH_OCR_LANGUAGE_TERM="$2"; shift 2 ;;
|
||||||
--dockerfile) DOCKERFILE="$2"; shift 2 ;;
|
--dockerfile) DOCKERFILE="$2"; shift 2 ;;
|
||||||
--skip-docker) SKIP_DOCKER=true; shift ;;
|
--skip-docker) SKIP_DOCKER=true; shift ;;
|
||||||
--skip-wasm) SKIP_WASM=true; shift ;;
|
--skip-wasm) SKIP_WASM=true; shift ;;
|
||||||
@@ -132,6 +252,18 @@ if [ ! -f "package.json" ] || [ ! -f "src/js/const/cdn-version.ts" ]; then
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Informational modes: print the requested OCR language information and stop
# before any prerequisite checks or bundle work.
if [[ "$LIST_OCR_LANGUAGES" == true ]]; then
  show_supported_ocr_languages
  exit 0
fi

if [[ -n "$SEARCH_OCR_LANGUAGE_TERM" ]]; then
  # Exit 0 when the search printed matches, 1 when it found none.
  show_matching_ocr_languages "$SEARCH_OCR_LANGUAGE_TERM" && exit 0
  exit 1
fi
|
||||||
|
|
||||||
# --- Check prerequisites ---
|
# --- Check prerequisites ---
|
||||||
check_prerequisites() {
|
check_prerequisites() {
|
||||||
local missing=false
|
local missing=false
|
||||||
@@ -141,6 +273,11 @@ check_prerequisites() {
|
|||||||
missing=true
|
missing=true
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if [ "$SKIP_WASM" = false ] && ! command -v curl &>/dev/null; then
|
||||||
|
error "curl is required to download OCR language data."
|
||||||
|
missing=true
|
||||||
|
fi
|
||||||
|
|
||||||
if [ "$SKIP_DOCKER" = false ] && ! command -v docker &>/dev/null; then
|
if [ "$SKIP_DOCKER" = false ] && ! command -v docker &>/dev/null; then
|
||||||
error "docker is required but not found (use --skip-docker to skip)."
|
error "docker is required but not found (use --skip-docker to skip)."
|
||||||
missing=true
|
missing=true
|
||||||
@@ -156,9 +293,11 @@ read_versions() {
|
|||||||
PYMUPDF_VERSION=$(grep "pymupdf:" src/js/const/cdn-version.ts | grep -o "'[^']*'" | tr -d "'")
|
PYMUPDF_VERSION=$(grep "pymupdf:" src/js/const/cdn-version.ts | grep -o "'[^']*'" | tr -d "'")
|
||||||
GS_VERSION=$(grep "ghostscript:" src/js/const/cdn-version.ts | grep -o "'[^']*'" | tr -d "'")
|
GS_VERSION=$(grep "ghostscript:" src/js/const/cdn-version.ts | grep -o "'[^']*'" | tr -d "'")
|
||||||
APP_VERSION=$(node -p "require('./package.json').version")
|
APP_VERSION=$(node -p "require('./package.json').version")
|
||||||
|
TESSERACT_VERSION=$(node -p "require('./package-lock.json').packages['node_modules/tesseract.js'].version")
|
||||||
|
TESSERACT_CORE_VERSION=$(node -p "require('./package-lock.json').packages['node_modules/tesseract.js-core'].version")
|
||||||
|
|
||||||
if [ -z "$PYMUPDF_VERSION" ] || [ -z "$GS_VERSION" ]; then
|
if [ -z "$PYMUPDF_VERSION" ] || [ -z "$GS_VERSION" ] || [ -z "$TESSERACT_VERSION" ] || [ -z "$TESSERACT_CORE_VERSION" ]; then
|
||||||
error "Failed to read WASM versions from src/js/const/cdn-version.ts"
|
error "Failed to read external asset versions from the repository metadata"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
@@ -175,6 +314,8 @@ interactive_mode() {
|
|||||||
echo " PyMuPDF: ${PYMUPDF_VERSION}"
|
echo " PyMuPDF: ${PYMUPDF_VERSION}"
|
||||||
echo " Ghostscript: ${GS_VERSION}"
|
echo " Ghostscript: ${GS_VERSION}"
|
||||||
echo " CoherentPDF: latest"
|
echo " CoherentPDF: latest"
|
||||||
|
echo " Tesseract.js: ${TESSERACT_VERSION}"
|
||||||
|
echo " OCR Data: ${TESSDATA_VERSION}"
|
||||||
echo ""
|
echo ""
|
||||||
|
|
||||||
# [1] WASM base URL (REQUIRED)
|
# [1] WASM base URL (REQUIRED)
|
||||||
@@ -256,8 +397,35 @@ interactive_mode() {
|
|||||||
DOCKERFILE="${input:-$DOCKERFILE}"
|
DOCKERFILE="${input:-$DOCKERFILE}"
|
||||||
echo ""
|
echo ""
|
||||||
|
|
||||||
# [8] Output directory (optional)
|
# [8] OCR languages (optional)
|
||||||
echo -e "${BOLD}[8/8] Output Directory ${GREEN}(optional)${NC}"
|
echo -e "${BOLD}[8/9] OCR Languages ${GREEN}(optional)${NC}"
|
||||||
|
echo " Comma-separated traineddata files to bundle for offline OCR."
|
||||||
|
echo " Enter Tesseract language codes such as: eng,deu,fra,spa"
|
||||||
|
echo " Type 'list' to print the full supported language list."
|
||||||
|
echo " Type 'search <term>' to find codes by name or abbreviation."
|
||||||
|
while true; do
|
||||||
|
read -r -p " OCR languages [${OCR_LANGUAGES}]: " input
|
||||||
|
if [ -z "${input:-}" ]; then
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
if [ "$input" = "list" ]; then
|
||||||
|
show_supported_ocr_languages
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
if [[ "$input" == search\ * ]]; then
|
||||||
|
search_query="${input#search }"
|
||||||
|
if ! show_matching_ocr_languages "$search_query"; then
|
||||||
|
warn "No OCR language matched '${search_query}'."
|
||||||
|
fi
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
OCR_LANGUAGES="$input"
|
||||||
|
break
|
||||||
|
done
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
# [9] Output directory (optional)
|
||||||
|
echo -e "${BOLD}[9/9] Output Directory ${GREEN}(optional)${NC}"
|
||||||
read -r -p " Path [${OUTPUT_DIR}]: " input
|
read -r -p " Path [${OUTPUT_DIR}]: " input
|
||||||
OUTPUT_DIR="${input:-$OUTPUT_DIR}"
|
OUTPUT_DIR="${input:-$OUTPUT_DIR}"
|
||||||
|
|
||||||
@@ -274,6 +442,7 @@ interactive_mode() {
|
|||||||
[ -n "$BRAND_NAME" ] && echo " Brand Logo: ${BRAND_LOGO:-images/favicon-no-bg.svg (default)}"
|
[ -n "$BRAND_NAME" ] && echo " Brand Logo: ${BRAND_LOGO:-images/favicon-no-bg.svg (default)}"
|
||||||
[ -n "$BRAND_NAME" ] && echo " Footer Text: ${FOOTER_TEXT:-(default)}"
|
[ -n "$BRAND_NAME" ] && echo " Footer Text: ${FOOTER_TEXT:-(default)}"
|
||||||
echo " Base URL: ${BASE_URL:-/ (root)}"
|
echo " Base URL: ${BASE_URL:-/ (root)}"
|
||||||
|
echo " OCR Languages: ${OCR_LANGUAGES}"
|
||||||
echo " Output: ${OUTPUT_DIR}"
|
echo " Output: ${OUTPUT_DIR}"
|
||||||
echo ""
|
echo ""
|
||||||
read -r -p " Proceed? (Y/n): " input
|
read -r -p " Proceed? (Y/n): " input
|
||||||
@@ -321,6 +490,7 @@ filesize() {
|
|||||||
|
|
||||||
check_prerequisites
|
check_prerequisites
|
||||||
read_versions
|
read_versions
|
||||||
|
load_supported_ocr_languages
|
||||||
|
|
||||||
# If no WASM base URL provided, go interactive
|
# If no WASM base URL provided, go interactive
|
||||||
if [ -z "$WASM_BASE_URL" ]; then
|
if [ -z "$WASM_BASE_URL" ]; then
|
||||||
@@ -338,6 +508,34 @@ if [ -n "$LANGUAGE" ]; then
|
|||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Normalize the requested OCR languages: strip whitespace, drop empty entries,
# reject malformed or unsupported codes, and skip repeated codes so each
# language is bundled and advertised only once. First-seen order is kept.
IFS=',' read -r -a OCR_LANGUAGE_ARRAY <<< "$OCR_LANGUAGES"
NORMALIZED_OCR_LANGUAGES=()
# Comma-delimited record of accepted codes (",eng,deu,"). A plain string keeps
# the duplicate check free of associative arrays and of expanding an array
# that is still empty.
seen_ocr_languages=","
for raw_lang in "${OCR_LANGUAGE_ARRAY[@]}"; do
  lang=$(echo "$raw_lang" | tr -d '[:space:]')
  if [ -z "$lang" ]; then
    continue
  fi
  if [[ ! "$lang" =~ ^[a-z0-9_]+$ ]]; then
    error "Invalid OCR language code: ${lang}"
    error "Use comma-separated Tesseract codes such as eng,deu,fra,chi_sim"
    exit 1
  fi
  if ! is_supported_ocr_language "$lang"; then
    error "Unsupported OCR language code: ${lang}"
    error "Run with --list-ocr-languages or --search-ocr-language <term> to find supported Tesseract codes."
    exit 1
  fi
  # Already accepted earlier in the list (e.g. "eng,deu,eng").
  if [[ "$seen_ocr_languages" == *",${lang},"* ]]; then
    continue
  fi
  seen_ocr_languages+="${lang},"
  NORMALIZED_OCR_LANGUAGES+=("$lang")
done

if [ ${#NORMALIZED_OCR_LANGUAGES[@]} -eq 0 ]; then
  error "At least one OCR language must be included."
  exit 1
fi

# Re-join the cleaned codes so later steps see the normalized list.
OCR_LANGUAGES=$(IFS=','; echo "${NORMALIZED_OCR_LANGUAGES[*]}")
load_required_ocr_fonts
|
||||||
|
|
||||||
# Validate WASM base URL format
|
# Validate WASM base URL format
|
||||||
if [[ ! "$WASM_BASE_URL" =~ ^https?:// ]]; then
|
if [[ ! "$WASM_BASE_URL" =~ ^https?:// ]]; then
|
||||||
error "WASM base URL must start with http:// or https://"
|
error "WASM base URL must start with http:// or https://"
|
||||||
@@ -353,11 +551,15 @@ WASM_BASE_URL="${WASM_BASE_URL%/}"
|
|||||||
WASM_PYMUPDF_URL="${WASM_BASE_URL}/pymupdf/"
|
WASM_PYMUPDF_URL="${WASM_BASE_URL}/pymupdf/"
|
||||||
WASM_GS_URL="${WASM_BASE_URL}/gs/"
|
WASM_GS_URL="${WASM_BASE_URL}/gs/"
|
||||||
WASM_CPDF_URL="${WASM_BASE_URL}/cpdf/"
|
WASM_CPDF_URL="${WASM_BASE_URL}/cpdf/"
|
||||||
|
OCR_TESSERACT_WORKER_URL="${WASM_BASE_URL}/ocr/worker.min.js"
|
||||||
|
OCR_TESSERACT_CORE_URL="${WASM_BASE_URL}/ocr/core"
|
||||||
|
OCR_TESSERACT_LANG_URL="${WASM_BASE_URL}/ocr/lang-data"
|
||||||
|
OCR_FONT_BASE_URL="${WASM_BASE_URL}/ocr/fonts"
|
||||||
|
|
||||||
echo ""
|
echo ""
|
||||||
echo -e "${BOLD}============================================================${NC}"
|
echo -e "${BOLD}============================================================${NC}"
|
||||||
echo -e "${BOLD} BentoPDF Air-Gapped Bundle Preparation${NC}"
|
echo -e "${BOLD} BentoPDF Air-Gapped Bundle Preparation${NC}"
|
||||||
echo -e "${BOLD} App: v${APP_VERSION} | PyMuPDF: ${PYMUPDF_VERSION} | GS: ${GS_VERSION}${NC}"
|
echo -e "${BOLD} App: v${APP_VERSION} | PyMuPDF: ${PYMUPDF_VERSION} | GS: ${GS_VERSION} | OCR: ${TESSERACT_VERSION}${NC}"
|
||||||
echo -e "${BOLD}============================================================${NC}"
|
echo -e "${BOLD}============================================================${NC}"
|
||||||
|
|
||||||
# --- Phase 1: Prepare output directory ---
|
# --- Phase 1: Prepare output directory ---
|
||||||
@@ -398,6 +600,27 @@ if [ "$SKIP_WASM" = true ]; then
|
|||||||
error "Missing: coherentpdf-*.tgz"
|
error "Missing: coherentpdf-*.tgz"
|
||||||
wasm_missing=true
|
wasm_missing=true
|
||||||
fi
|
fi
|
||||||
|
# `tesseract.js-*.tgz` would also match `tesseract.js-core-*.tgz`, letting a
# missing worker package go unnoticed whenever the core tarball is present.
# Anchoring the glob on the leading version digit matches only the
# tesseract.js tarball itself (e.g. tesseract.js-7.0.0.tgz).
if ! ls "$OUTPUT_DIR"/tesseract.js-[0-9]*.tgz &>/dev/null; then
  error "Missing: tesseract.js-*.tgz"
  wasm_missing=true
fi
|
||||||
|
if ! ls "$OUTPUT_DIR"/tesseract.js-core-*.tgz &>/dev/null; then
|
||||||
|
error "Missing: tesseract.js-core-*.tgz"
|
||||||
|
wasm_missing=true
|
||||||
|
fi
|
||||||
|
for lang in "${NORMALIZED_OCR_LANGUAGES[@]}"; do
|
||||||
|
if [ ! -f "$OUTPUT_DIR/tesseract-langdata/${lang}.traineddata.gz" ]; then
|
||||||
|
error "Missing: tesseract-langdata/${lang}.traineddata.gz"
|
||||||
|
wasm_missing=true
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
while IFS=$'\t' read -r font_family font_url font_file; do
|
||||||
|
[ -z "$font_file" ] && continue
|
||||||
|
if [ ! -f "$OUTPUT_DIR/ocr-fonts/${font_file}" ]; then
|
||||||
|
error "Missing: ocr-fonts/${font_file} (${font_family})"
|
||||||
|
wasm_missing=true
|
||||||
|
fi
|
||||||
|
done <<< "$OCR_FONT_MANIFEST_RAW"
|
||||||
if [ "$wasm_missing" = true ]; then
|
if [ "$wasm_missing" = true ]; then
|
||||||
error "Run without --skip-wasm first to download the packages."
|
error "Run without --skip-wasm first to download the packages."
|
||||||
exit 1
|
exit 1
|
||||||
@@ -430,8 +653,42 @@ else
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
info "Downloading tesseract.js@${TESSERACT_VERSION}..."
|
||||||
|
if ! (cd "$WASM_TMP" && npm pack "tesseract.js@${TESSERACT_VERSION}" --quiet 2>&1); then
|
||||||
|
error "Failed to download tesseract.js@${TESSERACT_VERSION}"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
info "Downloading tesseract.js-core@${TESSERACT_CORE_VERSION}..."
|
||||||
|
if ! (cd "$WASM_TMP" && npm pack "tesseract.js-core@${TESSERACT_CORE_VERSION}" --quiet 2>&1); then
|
||||||
|
error "Failed to download tesseract.js-core@${TESSERACT_CORE_VERSION}"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
# Move to output directory
|
# Move to output directory
|
||||||
mv "$WASM_TMP"/*.tgz "$OUTPUT_DIR/"
|
mv "$WASM_TMP"/*.tgz "$OUTPUT_DIR/"
|
||||||
|
|
||||||
|
mkdir -p "$OUTPUT_DIR/tesseract-langdata"
|
||||||
|
for lang in "${NORMALIZED_OCR_LANGUAGES[@]}"; do
|
||||||
|
info "Downloading OCR language data: ${lang}..."
|
||||||
|
if ! curl -fsSL "https://cdn.jsdelivr.net/npm/@tesseract.js-data/${lang}/${TESSDATA_VERSION}/${lang}.traineddata.gz" -o "$OUTPUT_DIR/tesseract-langdata/${lang}.traineddata.gz"; then
|
||||||
|
error "Failed to download OCR language data for ${lang}"
|
||||||
|
error "Check that the language code exists and that the network can reach jsDelivr."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
mkdir -p "$OUTPUT_DIR/ocr-fonts"
|
||||||
|
while IFS=$'\t' read -r font_family font_url font_file; do
|
||||||
|
[ -z "$font_file" ] && continue
|
||||||
|
info "Downloading OCR font: ${font_family}..."
|
||||||
|
if ! curl -fsSL "$font_url" -o "$OUTPUT_DIR/ocr-fonts/${font_file}"; then
|
||||||
|
error "Failed to download OCR font '${font_family}'"
|
||||||
|
error "Check that the network can reach the font URL: ${font_url}"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
done <<< "$OCR_FONT_MANIFEST_RAW"
|
||||||
|
|
||||||
rm -rf "$WASM_TMP"
|
rm -rf "$WASM_TMP"
|
||||||
trap - EXIT
|
trap - EXIT
|
||||||
|
|
||||||
@@ -443,6 +700,10 @@ else
|
|||||||
info " PyMuPDF: $(filesize "$OUTPUT_DIR"/bentopdf-pymupdf-wasm-*.tgz)"
|
info " PyMuPDF: $(filesize "$OUTPUT_DIR"/bentopdf-pymupdf-wasm-*.tgz)"
|
||||||
info " Ghostscript: $(filesize "$OUTPUT_DIR"/bentopdf-gs-wasm-*.tgz)"
|
info " Ghostscript: $(filesize "$OUTPUT_DIR"/bentopdf-gs-wasm-*.tgz)"
|
||||||
info " CoherentPDF: $(filesize "$CPDF_TGZ") (v${CPDF_VERSION})"
|
info " CoherentPDF: $(filesize "$CPDF_TGZ") (v${CPDF_VERSION})"
|
||||||
|
info " Tesseract.js: $(filesize "$OUTPUT_DIR"/tesseract.js-*.tgz)"
|
||||||
|
info " OCR Core: $(filesize "$OUTPUT_DIR"/tesseract.js-core-*.tgz)"
|
||||||
|
info " OCR Langs: ${OCR_LANGUAGES}"
|
||||||
|
# Join the family names inside awk. `paste -d ', '` treats its argument as a
# list of single-character delimiters used in rotation ("," then " "), so it
# would print "A,B C,D" rather than "A, B, C, D".
info " OCR Fonts: $(printf '%s\n' "$OCR_FONT_MANIFEST_RAW" | awk -F '\t' 'NF >= 1 { joined = joined sep $1; sep = ", " } END { print joined }')"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Resolve CPDF version if we skipped download
|
# Resolve CPDF version if we skipped download
|
||||||
@@ -488,6 +749,11 @@ else
|
|||||||
BUILD_ARGS+=(--build-arg "VITE_WASM_PYMUPDF_URL=${WASM_PYMUPDF_URL}")
|
BUILD_ARGS+=(--build-arg "VITE_WASM_PYMUPDF_URL=${WASM_PYMUPDF_URL}")
|
||||||
BUILD_ARGS+=(--build-arg "VITE_WASM_GS_URL=${WASM_GS_URL}")
|
BUILD_ARGS+=(--build-arg "VITE_WASM_GS_URL=${WASM_GS_URL}")
|
||||||
BUILD_ARGS+=(--build-arg "VITE_WASM_CPDF_URL=${WASM_CPDF_URL}")
|
BUILD_ARGS+=(--build-arg "VITE_WASM_CPDF_URL=${WASM_CPDF_URL}")
|
||||||
|
BUILD_ARGS+=(--build-arg "VITE_TESSERACT_WORKER_URL=${OCR_TESSERACT_WORKER_URL}")
|
||||||
|
BUILD_ARGS+=(--build-arg "VITE_TESSERACT_CORE_URL=${OCR_TESSERACT_CORE_URL}")
|
||||||
|
BUILD_ARGS+=(--build-arg "VITE_TESSERACT_LANG_URL=${OCR_TESSERACT_LANG_URL}")
|
||||||
|
BUILD_ARGS+=(--build-arg "VITE_TESSERACT_AVAILABLE_LANGUAGES=${OCR_LANGUAGES}")
|
||||||
|
BUILD_ARGS+=(--build-arg "VITE_OCR_FONT_BASE_URL=${OCR_FONT_BASE_URL}")
|
||||||
|
|
||||||
[ -n "$SIMPLE_MODE" ] && BUILD_ARGS+=(--build-arg "SIMPLE_MODE=${SIMPLE_MODE}")
|
[ -n "$SIMPLE_MODE" ] && BUILD_ARGS+=(--build-arg "SIMPLE_MODE=${SIMPLE_MODE}")
|
||||||
[ -n "$BASE_URL" ] && BUILD_ARGS+=(--build-arg "BASE_URL=${BASE_URL}")
|
[ -n "$BASE_URL" ] && BUILD_ARGS+=(--build-arg "BASE_URL=${BASE_URL}")
|
||||||
@@ -503,6 +769,12 @@ else
|
|||||||
info " PyMuPDF: ${WASM_PYMUPDF_URL}"
|
info " PyMuPDF: ${WASM_PYMUPDF_URL}"
|
||||||
info " Ghostscript: ${WASM_GS_URL}"
|
info " Ghostscript: ${WASM_GS_URL}"
|
||||||
info " CoherentPDF: ${WASM_CPDF_URL}"
|
info " CoherentPDF: ${WASM_CPDF_URL}"
|
||||||
|
info "OCR URLs:"
|
||||||
|
info " Worker: ${OCR_TESSERACT_WORKER_URL}"
|
||||||
|
info " Core: ${OCR_TESSERACT_CORE_URL}"
|
||||||
|
info " Lang Data: ${OCR_TESSERACT_LANG_URL}"
|
||||||
|
info " Font Base: ${OCR_FONT_BASE_URL}"
|
||||||
|
info " Languages: ${OCR_LANGUAGES}"
|
||||||
echo ""
|
echo ""
|
||||||
info "Building... this may take a few minutes (npm install + Vite build)."
|
info "Building... this may take a few minutes (npm install + Vite build)."
|
||||||
echo ""
|
echo ""
|
||||||
@@ -582,7 +854,7 @@ fi
|
|||||||
echo ""
|
echo ""
|
||||||
echo "[2/3] Extracting WASM packages to \${WASM_DIR}..."
|
echo "[2/3] Extracting WASM packages to \${WASM_DIR}..."
|
||||||
|
|
||||||
mkdir -p "\${WASM_DIR}/pymupdf" "\${WASM_DIR}/gs" "\${WASM_DIR}/cpdf"
|
mkdir -p "\${WASM_DIR}/pymupdf" "\${WASM_DIR}/gs" "\${WASM_DIR}/cpdf" "\${WASM_DIR}/ocr/core" "\${WASM_DIR}/ocr/lang-data" "\${WASM_DIR}/ocr/fonts"
|
||||||
|
|
||||||
# PyMuPDF: package has dist/ and assets/ at root
|
# PyMuPDF: package has dist/ and assets/ at root
|
||||||
echo " Extracting PyMuPDF..."
|
echo " Extracting PyMuPDF..."
|
||||||
@@ -610,12 +882,35 @@ else
|
|||||||
fi
|
fi
|
||||||
rm -rf "\${TEMP_CPDF}"
|
rm -rf "\${TEMP_CPDF}"
|
||||||
|
|
||||||
|
# Tesseract worker: browser expects a single worker.min.js file
|
||||||
|
echo " Extracting Tesseract worker..."
|
||||||
|
TEMP_TESS="\$(mktemp -d)"
|
||||||
|
tar xzf "\${SCRIPT_DIR}"/tesseract.js-*.tgz -C "\${TEMP_TESS}"
|
||||||
|
cp "\${TEMP_TESS}/package/dist/worker.min.js" "\${WASM_DIR}/ocr/worker.min.js"
|
||||||
|
rm -rf "\${TEMP_TESS}"
|
||||||
|
|
||||||
|
# Tesseract core: browser expects the full tesseract.js-core directory
|
||||||
|
echo " Extracting Tesseract core..."
|
||||||
|
tar xzf "\${SCRIPT_DIR}"/tesseract.js-core-*.tgz -C "\${WASM_DIR}/ocr/core" --strip-components=1
|
||||||
|
|
||||||
|
# OCR language data: copy the bundled traineddata files
|
||||||
|
echo " Installing OCR language data..."
|
||||||
|
cp "\${SCRIPT_DIR}"/tesseract-langdata/*.traineddata.gz "\${WASM_DIR}/ocr/lang-data/"
|
||||||
|
|
||||||
|
# OCR fonts: copy the bundled font files for searchable text layer rendering
|
||||||
|
echo " Installing OCR fonts..."
|
||||||
|
cp "\${SCRIPT_DIR}"/ocr-fonts/* "\${WASM_DIR}/ocr/fonts/"
|
||||||
|
|
||||||
echo " WASM files extracted to: \${WASM_DIR}"
|
echo " WASM files extracted to: \${WASM_DIR}"
|
||||||
echo ""
|
echo ""
|
||||||
echo " IMPORTANT: Ensure these paths are served by your internal web server:"
|
echo " IMPORTANT: Ensure these paths are served by your internal web server:"
|
||||||
echo " \${WASM_BASE_URL}/pymupdf/ -> \${WASM_DIR}/pymupdf/"
|
echo " \${WASM_BASE_URL}/pymupdf/ -> \${WASM_DIR}/pymupdf/"
|
||||||
echo " \${WASM_BASE_URL}/gs/ -> \${WASM_DIR}/gs/"
|
echo " \${WASM_BASE_URL}/gs/ -> \${WASM_DIR}/gs/"
|
||||||
echo " \${WASM_BASE_URL}/cpdf/ -> \${WASM_DIR}/cpdf/"
|
echo " \${WASM_BASE_URL}/cpdf/ -> \${WASM_DIR}/cpdf/"
|
||||||
|
echo " \${WASM_BASE_URL}/ocr/worker.min.js -> \${WASM_DIR}/ocr/worker.min.js"
|
||||||
|
echo " \${WASM_BASE_URL}/ocr/core -> \${WASM_DIR}/ocr/core/"
|
||||||
|
echo " \${WASM_BASE_URL}/ocr/lang-data -> \${WASM_DIR}/ocr/lang-data/"
|
||||||
|
echo " \${WASM_BASE_URL}/ocr/fonts -> \${WASM_DIR}/ocr/fonts/"
|
||||||
|
|
||||||
# --- Step 3: Start BentoPDF ---
|
# --- Step 3: Start BentoPDF ---
|
||||||
echo ""
|
echo ""
|
||||||
@@ -654,6 +949,10 @@ cat > "$OUTPUT_DIR/README.md" <<README_EOF
|
|||||||
| \`bentopdf-pymupdf-wasm-${PYMUPDF_VERSION}.tgz\` | PyMuPDF WASM module |
|
| \`bentopdf-pymupdf-wasm-${PYMUPDF_VERSION}.tgz\` | PyMuPDF WASM module |
|
||||||
| \`bentopdf-gs-wasm-${GS_VERSION}.tgz\` | Ghostscript WASM module |
|
| \`bentopdf-gs-wasm-${GS_VERSION}.tgz\` | Ghostscript WASM module |
|
||||||
| \`coherentpdf-${CPDF_VERSION}.tgz\` | CoherentPDF WASM module |
|
| \`coherentpdf-${CPDF_VERSION}.tgz\` | CoherentPDF WASM module |
|
||||||
|
| \`tesseract.js-${TESSERACT_VERSION}.tgz\` | Tesseract browser worker package |
|
||||||
|
| \`tesseract.js-core-${TESSERACT_CORE_VERSION}.tgz\` | Tesseract core runtime package |
|
||||||
|
| \`tesseract-langdata/\` | OCR language data files (${OCR_LANGUAGES}) |
|
||||||
|
| \`ocr-fonts/\` | OCR text-layer font files |
|
||||||
| \`setup.sh\` | Automated setup script |
|
| \`setup.sh\` | Automated setup script |
|
||||||
| \`README.md\` | This file |
|
| \`README.md\` | This file |
|
||||||
|
|
||||||
@@ -664,6 +963,16 @@ The Docker image was built with these WASM URLs:
|
|||||||
- **PyMuPDF:** \`${WASM_PYMUPDF_URL}\`
|
- **PyMuPDF:** \`${WASM_PYMUPDF_URL}\`
|
||||||
- **Ghostscript:** \`${WASM_GS_URL}\`
|
- **Ghostscript:** \`${WASM_GS_URL}\`
|
||||||
- **CoherentPDF:** \`${WASM_CPDF_URL}\`
|
- **CoherentPDF:** \`${WASM_CPDF_URL}\`
|
||||||
|
- **OCR Worker:** \`${OCR_TESSERACT_WORKER_URL}\`
|
||||||
|
- **OCR Core:** \`${OCR_TESSERACT_CORE_URL}\`
|
||||||
|
- **OCR Lang Data:** \`${OCR_TESSERACT_LANG_URL}\`
|
||||||
|
- **OCR Font Base:** \`${OCR_FONT_BASE_URL}\`
|
||||||
|
|
||||||
|
Bundled OCR languages: **${OCR_LANGUAGES}**
|
||||||
|
|
||||||
|
Bundled OCR fonts:
|
||||||
|
|
||||||
|
$(printf '%s\n' "$OCR_FONT_MANIFEST_RAW" | awk -F '\t' 'NF >= 3 { printf "- **%s** -> `%s`\n", $1, $3 }')
|
||||||
|
|
||||||
These URLs are baked into the app at build time. The user's browser fetches
|
These URLs are baked into the app at build time. The user's browser fetches
|
||||||
WASM files from these URLs at runtime.
|
WASM files from these URLs at runtime.
|
||||||
@@ -694,7 +1003,7 @@ docker load -i bentopdf.tar
|
|||||||
Extract to your internal web server's document root:
|
Extract to your internal web server's document root:
|
||||||
|
|
||||||
\`\`\`bash
|
\`\`\`bash
|
||||||
mkdir -p ./wasm/pymupdf ./wasm/gs ./wasm/cpdf
|
mkdir -p ./wasm/pymupdf ./wasm/gs ./wasm/cpdf ./wasm/ocr/core ./wasm/ocr/lang-data ./wasm/ocr/fonts
|
||||||
|
|
||||||
# PyMuPDF
|
# PyMuPDF
|
||||||
tar xzf bentopdf-pymupdf-wasm-${PYMUPDF_VERSION}.tgz -C ./wasm/pymupdf --strip-components=1
|
tar xzf bentopdf-pymupdf-wasm-${PYMUPDF_VERSION}.tgz -C ./wasm/pymupdf --strip-components=1
|
||||||
@@ -710,6 +1019,21 @@ TEMP_CPDF=\$(mktemp -d)
|
|||||||
tar xzf coherentpdf-${CPDF_VERSION}.tgz -C \$TEMP_CPDF
|
tar xzf coherentpdf-${CPDF_VERSION}.tgz -C \$TEMP_CPDF
|
||||||
cp -r \$TEMP_CPDF/package/dist/* ./wasm/cpdf/
|
cp -r \$TEMP_CPDF/package/dist/* ./wasm/cpdf/
|
||||||
rm -rf \$TEMP_CPDF
|
rm -rf \$TEMP_CPDF
|
||||||
|
|
||||||
|
# Tesseract worker
|
||||||
|
TEMP_TESS=\$(mktemp -d)
|
||||||
|
tar xzf tesseract.js-${TESSERACT_VERSION}.tgz -C \$TEMP_TESS
|
||||||
|
cp \$TEMP_TESS/package/dist/worker.min.js ./wasm/ocr/worker.min.js
|
||||||
|
rm -rf \$TEMP_TESS
|
||||||
|
|
||||||
|
# Tesseract core
|
||||||
|
tar xzf tesseract.js-core-${TESSERACT_CORE_VERSION}.tgz -C ./wasm/ocr/core --strip-components=1
|
||||||
|
|
||||||
|
# OCR language data
|
||||||
|
cp ./tesseract-langdata/*.traineddata.gz ./wasm/ocr/lang-data/
|
||||||
|
|
||||||
|
# OCR fonts
|
||||||
|
cp ./ocr-fonts/* ./wasm/ocr/fonts/
|
||||||
\`\`\`
|
\`\`\`
|
||||||
|
|
||||||
### 3. Configure your web server
|
### 3. Configure your web server
|
||||||
@@ -721,6 +1045,10 @@ Ensure these paths are accessible at the configured URLs:
|
|||||||
| \`${WASM_PYMUPDF_URL}\` | \`./wasm/pymupdf/\` |
|
| \`${WASM_PYMUPDF_URL}\` | \`./wasm/pymupdf/\` |
|
||||||
| \`${WASM_GS_URL}\` | \`./wasm/gs/\` |
|
| \`${WASM_GS_URL}\` | \`./wasm/gs/\` |
|
||||||
| \`${WASM_CPDF_URL}\` | \`./wasm/cpdf/\` |
|
| \`${WASM_CPDF_URL}\` | \`./wasm/cpdf/\` |
|
||||||
|
| \`${OCR_TESSERACT_WORKER_URL}\` | \`./wasm/ocr/worker.min.js\` |
|
||||||
|
| \`${OCR_TESSERACT_CORE_URL}\` | \`./wasm/ocr/core/\` |
|
||||||
|
| \`${OCR_TESSERACT_LANG_URL}\` | \`./wasm/ocr/lang-data/\` |
|
||||||
|
| \`${OCR_FONT_BASE_URL}\` | \`./wasm/ocr/fonts/\` |
|
||||||
|
|
||||||
### 4. Run BentoPDF
|
### 4. Run BentoPDF
|
||||||
|
|
||||||
|
|||||||
@@ -1,37 +1,39 @@
|
|||||||
import Tesseract from 'tesseract.js';
|
import type Tesseract from 'tesseract.js';
|
||||||
|
|
||||||
import type { ComparePageModel, CompareTextItem } from '../types.ts';
|
import type { ComparePageModel, CompareTextItem } from '../types.ts';
|
||||||
import { mergeIntoLines, sortCompareTextItems } from './extract-page-model.ts';
|
import { mergeIntoLines, sortCompareTextItems } from './extract-page-model.ts';
|
||||||
import {
|
import {
|
||||||
joinCompareTextItems,
|
joinCompareTextItems,
|
||||||
normalizeCompareText,
|
normalizeCompareText,
|
||||||
} from './text-normalization.ts';
|
} from './text-normalization.ts';
|
||||||
|
import { createConfiguredTesseractWorker } from '../../utils/tesseract-runtime.js';
|
||||||
|
|
||||||
type OcrWord = {
|
type OcrWord = Tesseract.Word;
|
||||||
text: string;
|
type OcrRecognizeResult = Tesseract.RecognizeResult;
|
||||||
bbox: {
|
type OcrPageWithWords = Tesseract.Page & { words: OcrWord[] };
|
||||||
x0: number;
|
|
||||||
y0: number;
|
|
||||||
x1: number;
|
|
||||||
y1: number;
|
|
||||||
};
|
|
||||||
};
|
|
||||||
|
|
||||||
export async function recognizePageCanvas(
|
export async function recognizePageCanvas(
|
||||||
canvas: HTMLCanvasElement,
|
canvas: HTMLCanvasElement,
|
||||||
language: string,
|
language: string,
|
||||||
onProgress?: (status: string, progress: number) => void
|
onProgress?: (status: string, progress: number) => void
|
||||||
): Promise<ComparePageModel> {
|
): Promise<ComparePageModel> {
|
||||||
const result = await Tesseract.recognize(canvas, language, {
|
const worker = await createConfiguredTesseractWorker(
|
||||||
logger(message) {
|
language,
|
||||||
|
1,
|
||||||
|
(message) => {
|
||||||
onProgress?.(message.status, message.progress || 0);
|
onProgress?.(message.status, message.progress || 0);
|
||||||
},
|
}
|
||||||
});
|
);
|
||||||
|
|
||||||
const ocrData = result.data as unknown as { words?: OcrWord[] };
|
let result: OcrRecognizeResult;
|
||||||
const words = ((ocrData.words || []) as OcrWord[])
|
try {
|
||||||
|
result = await worker.recognize(canvas);
|
||||||
|
} finally {
|
||||||
|
await worker.terminate();
|
||||||
|
}
|
||||||
|
|
||||||
|
const words = (result.data as OcrPageWithWords).words
|
||||||
.map((word, index) => {
|
.map((word, index) => {
|
||||||
const normalizedText = normalizeCompareText(word.text || '');
|
const normalizedText = normalizeCompareText(word.text);
|
||||||
if (!normalizedText) return null;
|
if (!normalizedText) return null;
|
||||||
|
|
||||||
const item: CompareTextItem = {
|
const item: CompareTextItem = {
|
||||||
|
|||||||
@@ -4,186 +4,230 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
/**
 * Maps a Tesseract language code to the Noto font family used for text in
 * that language's script. Entries are grouped by script.
 */
export const languageToFontFamily: Record<string, string> = {
  // CJK Languages
  jpn: 'Noto Sans JP',
  chi_sim: 'Noto Sans SC',
  chi_tra: 'Noto Sans TC',
  kor: 'Noto Sans KR',

  // Arabic Script
  ara: 'Noto Sans Arabic',
  fas: 'Noto Sans Arabic',
  urd: 'Noto Sans Arabic',
  pus: 'Noto Sans Arabic',
  kur: 'Noto Sans Arabic',
  // Uyghur is written in Arabic script, which base Noto Sans does not cover.
  uig: 'Noto Sans Arabic',

  // Devanagari Script
  hin: 'Noto Sans Devanagari',
  mar: 'Noto Sans Devanagari',
  san: 'Noto Sans Devanagari',
  nep: 'Noto Sans Devanagari',

  // Bengali Script
  ben: 'Noto Sans Bengali',
  asm: 'Noto Sans Bengali',

  // Tamil Script
  tam: 'Noto Sans Tamil',

  // Telugu Script
  tel: 'Noto Sans Telugu',

  // Kannada Script
  kan: 'Noto Sans Kannada',

  // Malayalam Script
  mal: 'Noto Sans Malayalam',

  // Gujarati Script
  guj: 'Noto Sans Gujarati',

  // Gurmukhi Script (Punjabi)
  pan: 'Noto Sans Gurmukhi',

  // Oriya Script
  ori: 'Noto Sans Oriya',

  // Sinhala Script
  sin: 'Noto Sans Sinhala',

  // Thai Script
  tha: 'Noto Sans Thai',

  // Lao Script
  lao: 'Noto Sans Lao',

  // Khmer Script
  khm: 'Noto Sans Khmer',

  // Myanmar Script
  mya: 'Noto Sans Myanmar',

  // Tibetan Script
  bod: 'Noto Serif Tibetan',
  // Dzongkha is written in Tibetan script, which base Noto Sans does not cover.
  dzo: 'Noto Serif Tibetan',

  // Georgian Script
  kat: 'Noto Sans Georgian',
  kat_old: 'Noto Sans Georgian',

  // Armenian Script
  hye: 'Noto Sans Armenian',

  // Hebrew Script
  heb: 'Noto Sans Hebrew',
  yid: 'Noto Sans Hebrew',

  // Ethiopic Script
  amh: 'Noto Sans Ethiopic',
  tir: 'Noto Sans Ethiopic',

  // Cherokee Script
  chr: 'Noto Sans Cherokee',

  // Syriac Script
  syr: 'Noto Sans Syriac',

  // Cyrillic Script (Noto Sans includes Cyrillic)
  bel: 'Noto Sans',
  bul: 'Noto Sans',
  mkd: 'Noto Sans',
  rus: 'Noto Sans',
  srp: 'Noto Sans',
  srp_latn: 'Noto Sans',
  ukr: 'Noto Sans',
  kaz: 'Noto Sans',
  kir: 'Noto Sans',
  tgk: 'Noto Sans',
  uzb: 'Noto Sans',
  uzb_cyrl: 'Noto Sans',
  aze_cyrl: 'Noto Sans',

  // Latin Script (covered by base Noto Sans)
  afr: 'Noto Sans',
  aze: 'Noto Sans',
  bos: 'Noto Sans',
  cat: 'Noto Sans',
  ceb: 'Noto Sans',
  ces: 'Noto Sans',
  cym: 'Noto Sans',
  dan: 'Noto Sans',
  deu: 'Noto Sans',
  ell: 'Noto Sans',
  eng: 'Noto Sans',
  enm: 'Noto Sans',
  epo: 'Noto Sans',
  est: 'Noto Sans',
  eus: 'Noto Sans',
  fin: 'Noto Sans',
  fra: 'Noto Sans',
  frk: 'Noto Sans',
  frm: 'Noto Sans',
  gle: 'Noto Sans',
  glg: 'Noto Sans',
  grc: 'Noto Sans',
  hat: 'Noto Sans',
  hrv: 'Noto Sans',
  hun: 'Noto Sans',
  // NOTE(review): Inuktitut is commonly written in Canadian Aboriginal
  // syllabics; confirm whether base Noto Sans is sufficient here.
  iku: 'Noto Sans',
  ind: 'Noto Sans',
  isl: 'Noto Sans',
  ita: 'Noto Sans',
  ita_old: 'Noto Sans',
  jav: 'Noto Sans',
  lat: 'Noto Sans',
  lav: 'Noto Sans',
  lit: 'Noto Sans',
  mlt: 'Noto Sans',
  msa: 'Noto Sans',
  nld: 'Noto Sans',
  nor: 'Noto Sans',
  pol: 'Noto Sans',
  por: 'Noto Sans',
  ron: 'Noto Sans',
  slk: 'Noto Sans',
  slv: 'Noto Sans',
  spa: 'Noto Sans',
  spa_old: 'Noto Sans',
  sqi: 'Noto Sans',
  swa: 'Noto Sans',
  swe: 'Noto Sans',
  tgl: 'Noto Sans',
  tur: 'Noto Sans',
  vie: 'Noto Sans',
};
|
||||||
|
|
||||||
const NOTO_CJK_OTF_ROOT =
  'https://raw.githack.com/googlefonts/noto-cjk/main/Sans/OTF';
const NOTO_FONTS_ROOT = 'https://raw.githack.com/googlefonts/noto-fonts/main';

/** Builds the Regular OTF URL for a Noto Sans CJK regional variant. */
function notoCjkOtfUrl(regionDir: string, variant: string): string {
  return `${NOTO_CJK_OTF_ROOT}/${regionDir}/NotoSansCJK${variant}-Regular.otf`;
}

/** Builds the Regular TTF URL for a font folder in the noto-fonts repository. */
function notoTtfUrl(
  fontName: string,
  hinting: 'hinted' | 'unhinted' = 'hinted'
): string {
  return `${NOTO_FONTS_ROOT}/${hinting}/ttf/${fontName}/${fontName}-Regular.ttf`;
}

/**
 * Maps a font family name to the URL its Regular font file is downloaded
 * from. The final path segment of each URL is the font's file name.
 */
export const fontFamilyToUrl: Record<string, string> = {
  'Noto Sans JP': notoCjkOtfUrl('Japanese', 'jp'),
  'Noto Sans SC': notoCjkOtfUrl('SimplifiedChinese', 'sc'),
  'Noto Sans TC': notoCjkOtfUrl('TraditionalChinese', 'tc'),
  'Noto Sans KR': notoCjkOtfUrl('Korean', 'kr'),
  'Noto Sans Arabic': notoTtfUrl('NotoSansArabic'),
  // Devanagari is the one family taken from the unhinted tree.
  'Noto Sans Devanagari': notoTtfUrl('NotoSansDevanagari', 'unhinted'),
  'Noto Sans Bengali': notoTtfUrl('NotoSansBengali'),
  'Noto Sans Gujarati': notoTtfUrl('NotoSansGujarati'),
  'Noto Sans Kannada': notoTtfUrl('NotoSansKannada'),
  'Noto Sans Malayalam': notoTtfUrl('NotoSansMalayalam'),
  'Noto Sans Oriya': notoTtfUrl('NotoSansOriya'),
  'Noto Sans Gurmukhi': notoTtfUrl('NotoSansGurmukhi'),
  'Noto Sans Tamil': notoTtfUrl('NotoSansTamil'),
  'Noto Sans Telugu': notoTtfUrl('NotoSansTelugu'),
  'Noto Sans Sinhala': notoTtfUrl('NotoSansSinhala'),
  'Noto Sans Thai': notoTtfUrl('NotoSansThai'),
  'Noto Sans Khmer': notoTtfUrl('NotoSansKhmer'),
  'Noto Sans Lao': notoTtfUrl('NotoSansLao'),
  'Noto Sans Myanmar': notoTtfUrl('NotoSansMyanmar'),
  'Noto Sans Hebrew': notoTtfUrl('NotoSansHebrew'),
  'Noto Sans Georgian': notoTtfUrl('NotoSansGeorgian'),
  'Noto Sans Ethiopic': notoTtfUrl('NotoSansEthiopic'),
  'Noto Serif Tibetan': notoTtfUrl('NotoSerifTibetan'),
  'Noto Sans Cherokee': notoTtfUrl('NotoSansCherokee'),
  'Noto Sans Armenian': notoTtfUrl('NotoSansArmenian'),
  'Noto Sans Syriac': notoTtfUrl('NotoSansSyriac'),
  'Noto Sans': notoTtfUrl('NotoSans'),
};
|
||||||
|
|
||||||
|
/**
 * Returns the download URL registered for `fontFamily` in `fontFamilyToUrl`,
 * falling back to the base 'Noto Sans' URL when the family has no entry.
 *
 * @param fontFamily - Font family name, e.g. 'Noto Sans Thai'.
 * @returns The family's URL, or the 'Noto Sans' URL as the fallback.
 */
export function getFontUrlForFamily(fontFamily: string): string {
  // Only own entries count: a plain index lookup would also resolve inherited
  // Object.prototype members (e.g. 'constructor', 'toString') and hand a
  // non-string back to callers that expect a URL.
  const hasOwnEntry = Object.prototype.hasOwnProperty.call(
    fontFamilyToUrl,
    fontFamily
  );

  return (
    (hasOwnEntry && fontFamilyToUrl[fontFamily]) || fontFamilyToUrl['Noto Sans']
  );
}
|
||||||
|
|
||||||
|
/**
 * Returns the file name (last path segment) of the URL that
 * `getFontUrlForFamily` resolves for `fontFamily`.
 *
 * @param fontFamily - Font family name, e.g. 'Noto Sans Thai'.
 * @returns The font file name, e.g. 'NotoSansThai-Regular.ttf'.
 * @throws Error when the URL has no last path segment (for example, it ends
 *   with '/').
 */
export function getFontAssetFileName(fontFamily: string): string {
  const defaultUrl = getFontUrlForFamily(fontFamily);

  // Drop any query string or fragment first so that it cannot end up inside
  // the file name.
  const [urlPath = ''] = defaultUrl.split(/[?#]/, 1);
  const fileName = urlPath.split('/').pop();

  if (!fileName) {
    throw new Error(
      `Could not resolve a font asset filename for ${fontFamily}`
    );
  }

  return fileName;
}
|
||||||
|
|||||||
@@ -4,6 +4,11 @@ import { downloadFile, formatBytes } from '../utils/helpers.js';
|
|||||||
import { icons, createIcons } from 'lucide';
|
import { icons, createIcons } from 'lucide';
|
||||||
import { OcrState } from '@/types';
|
import { OcrState } from '@/types';
|
||||||
import { performOcr } from '../utils/ocr.js';
|
import { performOcr } from '../utils/ocr.js';
|
||||||
|
import {
|
||||||
|
getAvailableTesseractLanguageEntries,
|
||||||
|
resolveConfiguredTesseractAvailableLanguages,
|
||||||
|
UnsupportedOcrLanguageError,
|
||||||
|
} from '../utils/tesseract-language-availability.js';
|
||||||
|
|
||||||
const pageState: OcrState = {
|
const pageState: OcrState = {
|
||||||
file: null,
|
file: null,
|
||||||
@@ -80,6 +85,30 @@ function resetState() {
|
|||||||
if (processBtn) processBtn.disabled = true;
|
if (processBtn) processBtn.disabled = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Refreshes the `#lang-availability-note` element.
 *
 * - No configured language list: the note is emptied and hidden.
 * - Configured list with no available entries: a rebuild hint is shown.
 * - Otherwise: the names of the available languages are shown.
 *
 * Does nothing when the element is not in the document.
 */
function updateLanguageAvailabilityNotice() {
  const noteEl = document.getElementById('lang-availability-note');
  if (!noteEl) return;

  if (!resolveConfiguredTesseractAvailableLanguages()) {
    noteEl.textContent = '';
    noteEl.classList.add('hidden');
    return;
  }

  const bundledEntries = getAvailableTesseractLanguageEntries();

  // Both remaining outcomes show the note; only its text differs.
  noteEl.classList.remove('hidden');
  noteEl.textContent =
    bundledEntries.length > 0
      ? `This deployment bundles OCR for: ${bundledEntries
          .map((entry) => entry[1])
          .join(', ')}.`
      : 'This deployment does not expose any valid OCR languages. Rebuild it with VITE_TESSERACT_AVAILABLE_LANGUAGES set to valid Tesseract codes.';
}
|
||||||
|
|
||||||
async function runOCR() {
|
async function runOCR() {
|
||||||
const selectedLangs = Array.from(
|
const selectedLangs = Array.from(
|
||||||
document.querySelectorAll('.lang-checkbox:checked')
|
document.querySelectorAll('.lang-checkbox:checked')
|
||||||
@@ -142,10 +171,14 @@ async function runOCR() {
|
|||||||
if (textOutput) textOutput.value = result.fullText.trim();
|
if (textOutput) textOutput.value = result.fullText.trim();
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
console.error(e);
|
console.error(e);
|
||||||
showAlert(
|
if (e instanceof UnsupportedOcrLanguageError) {
|
||||||
'OCR Error',
|
showAlert('OCR Language Not Available', e.message);
|
||||||
'An error occurred during the OCR process. The worker may have failed to load. Please try again.'
|
} else {
|
||||||
);
|
showAlert(
|
||||||
|
'OCR Error',
|
||||||
|
'An error occurred during the OCR process. The worker may have failed to load. Please try again.'
|
||||||
|
);
|
||||||
|
}
|
||||||
if (toolOptions) toolOptions.classList.remove('hidden');
|
if (toolOptions) toolOptions.classList.remove('hidden');
|
||||||
if (ocrProgress) ocrProgress.classList.add('hidden');
|
if (ocrProgress) ocrProgress.classList.add('hidden');
|
||||||
}
|
}
|
||||||
@@ -213,10 +246,21 @@ function populateLanguageList() {
|
|||||||
|
|
||||||
langList.innerHTML = '';
|
langList.innerHTML = '';
|
||||||
|
|
||||||
Object.entries(tesseractLanguages).forEach(function ([code, name]) {
|
const availableEntries = getAvailableTesseractLanguageEntries();
|
||||||
|
if (availableEntries.length === 0) {
|
||||||
|
const emptyState = document.createElement('p');
|
||||||
|
emptyState.className = 'text-sm text-yellow-300 p-2';
|
||||||
|
emptyState.textContent =
|
||||||
|
'No OCR languages are available in this deployment.';
|
||||||
|
langList.appendChild(emptyState);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
availableEntries.forEach(function ([code, name]) {
|
||||||
const label = document.createElement('label');
|
const label = document.createElement('label');
|
||||||
label.className =
|
label.className =
|
||||||
'flex items-center gap-2 p-2 rounded-md hover:bg-gray-700 cursor-pointer';
|
'flex items-center gap-2 p-2 rounded-md hover:bg-gray-700 cursor-pointer';
|
||||||
|
label.dataset.search = `${name} ${code}`.toLowerCase();
|
||||||
|
|
||||||
const checkbox = document.createElement('input');
|
const checkbox = document.createElement('input');
|
||||||
checkbox.type = 'checkbox';
|
checkbox.type = 'checkbox';
|
||||||
@@ -253,6 +297,7 @@ document.addEventListener('DOMContentLoaded', function () {
|
|||||||
const downloadPdfBtn = document.getElementById('download-searchable-pdf');
|
const downloadPdfBtn = document.getElementById('download-searchable-pdf');
|
||||||
|
|
||||||
populateLanguageList();
|
populateLanguageList();
|
||||||
|
updateLanguageAvailabilityNotice();
|
||||||
|
|
||||||
if (backBtn) {
|
if (backBtn) {
|
||||||
backBtn.addEventListener('click', function () {
|
backBtn.addEventListener('click', function () {
|
||||||
@@ -304,9 +349,9 @@ document.addEventListener('DOMContentLoaded', function () {
|
|||||||
langSearch.addEventListener('input', function () {
|
langSearch.addEventListener('input', function () {
|
||||||
const searchTerm = langSearch.value.toLowerCase();
|
const searchTerm = langSearch.value.toLowerCase();
|
||||||
langList.querySelectorAll('label').forEach(function (label) {
|
langList.querySelectorAll('label').forEach(function (label) {
|
||||||
(label as HTMLElement).style.display = label.textContent
|
(label as HTMLElement).style.display = (
|
||||||
?.toLowerCase()
|
label as HTMLElement
|
||||||
.includes(searchTerm)
|
).dataset.search?.includes(searchTerm)
|
||||||
? ''
|
? ''
|
||||||
: 'none';
|
: 'none';
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
import { showAlert } from '../ui.js';
|
import { showAlert } from '../ui.js';
|
||||||
import { tesseractLanguages } from '../config/tesseract-languages.js';
|
|
||||||
import { createWorkflowEditor, updateNodeDisplay } from '../workflow/editor';
|
import { createWorkflowEditor, updateNodeDisplay } from '../workflow/editor';
|
||||||
import { executeWorkflow } from '../workflow/engine';
|
import { executeWorkflow } from '../workflow/engine';
|
||||||
|
import { getAvailableTesseractLanguageEntries } from '../utils/tesseract-language-availability.js';
|
||||||
import {
|
import {
|
||||||
nodeRegistry,
|
nodeRegistry,
|
||||||
getNodesByCategory,
|
getNodesByCategory,
|
||||||
@@ -1194,7 +1194,7 @@ function showNodeSettings(node: BaseWorkflowNode) {
|
|||||||
{ label: 'High (288 DPI)', value: '3.0' },
|
{ label: 'High (288 DPI)', value: '3.0' },
|
||||||
{ label: 'Ultra (384 DPI)', value: '4.0' },
|
{ label: 'Ultra (384 DPI)', value: '4.0' },
|
||||||
],
|
],
|
||||||
language: Object.entries(tesseractLanguages).map(([code, name]) => ({
|
language: getAvailableTesseractLanguageEntries().map(([code, name]) => ({
|
||||||
label: name,
|
label: name,
|
||||||
value: code,
|
value: code,
|
||||||
})),
|
})),
|
||||||
|
|||||||
@@ -1,4 +1,8 @@
|
|||||||
import { languageToFontFamily, fontFamilyToUrl } from '../config/font-mappings.js';
|
import {
|
||||||
|
getFontAssetFileName,
|
||||||
|
getFontUrlForFamily,
|
||||||
|
languageToFontFamily,
|
||||||
|
} from '../config/font-mappings.js';
|
||||||
|
|
||||||
const fontCache: Map<string, ArrayBuffer> = new Map();
|
const fontCache: Map<string, ArrayBuffer> = new Map();
|
||||||
|
|
||||||
@@ -6,276 +10,321 @@ const DB_NAME = 'bentopdf-fonts';
|
|||||||
const DB_VERSION = 1;
|
const DB_VERSION = 1;
|
||||||
const STORE_NAME = 'fonts';
|
const STORE_NAME = 'fonts';
|
||||||
|
|
||||||
|
type OcrFontEnv = Partial<Pick<ImportMetaEnv, 'VITE_OCR_FONT_BASE_URL'>>;
|
||||||
|
|
||||||
|
/**
 * Returns the environment object used when a caller does not inject one.
 *
 * Kept as a function so `resolveFontUrl` can take an explicit `env`
 * argument instead of reading `import.meta.env` directly.
 */
function getDefaultFontEnv(): OcrFontEnv {
  return import.meta.env;
}
|
||||||
|
|
||||||
|
/**
 * Normalizes a configured font base URL.
 *
 * @param url - Raw configured value; may be missing or blank.
 * @returns The trimmed URL with all trailing slashes removed, or
 *   `undefined` when the value is missing or only whitespace.
 */
function normalizeFontBaseUrl(url?: string): string | undefined {
  const trimmed = url?.trim();

  if (!trimmed) {
    return undefined;
  }

  // Strip trailing slashes so callers can append `/${fileName}` safely.
  return trimmed.replace(/\/+$/, '');
}
|
||||||
|
|
||||||
|
/**
 * Resolves the URL a font family should be downloaded from.
 *
 * When `VITE_OCR_FONT_BASE_URL` is set to a non-blank value, the URL is that
 * base joined with the family's asset file name. Otherwise the URL comes
 * from `getFontUrlForFamily`.
 *
 * @param fontFamily - Font family name to resolve.
 * @param env - Environment to read the base URL from; defaults to the
 *   build-time environment.
 * @returns The URL to fetch the font from.
 */
export function resolveFontUrl(
  fontFamily: string,
  env: OcrFontEnv = getDefaultFontEnv()
): string {
  const fontBaseUrl = normalizeFontBaseUrl(env.VITE_OCR_FONT_BASE_URL);

  if (fontBaseUrl) {
    return `${fontBaseUrl}/${getFontAssetFileName(fontFamily)}`;
  }

  return getFontUrlForFamily(fontFamily);
}
|
||||||
|
|
||||||
/**
 * Opens the IndexedDB database used to cache font files.
 *
 * The object store is created during the upgrade step if it is missing.
 *
 * @returns A promise that resolves with the open database connection and
 *   rejects with the request error when opening fails.
 */
async function openFontDB(): Promise<IDBDatabase> {
  return new Promise((resolve, reject) => {
    const request = indexedDB.open(DB_NAME, DB_VERSION);

    request.onerror = () => reject(request.error);
    request.onsuccess = () => resolve(request.result);

    request.onupgradeneeded = (event) => {
      const db = (event.target as IDBOpenDBRequest).result;
      if (!db.objectStoreNames.contains(STORE_NAME)) {
        db.createObjectStore(STORE_NAME);
      }
    };
  });
}
|
||||||
|
|
||||||
|
/**
 * Reads a cached font buffer from the IndexedDB font store.
 *
 * This is a best-effort lookup: any failure is logged and reported as a
 * cache miss rather than thrown.
 *
 * @param fontFamily - Key the font was stored under.
 * @returns The cached buffer, or `null` when there is no entry or the read
 *   failed.
 */
async function getCachedFontFromDB(
  fontFamily: string
): Promise<ArrayBuffer | null> {
  try {
    const db = await openFontDB();

    try {
      // `return await` is required: returning the bare promise would let a
      // rejected read bypass the surrounding catch and reach the caller.
      return await new Promise<ArrayBuffer | null>((resolve, reject) => {
        const transaction = db.transaction(STORE_NAME, 'readonly');
        const store = transaction.objectStore(STORE_NAME);
        const request = store.get(fontFamily);

        request.onsuccess = () => resolve(request.result || null);
        request.onerror = () => reject(request.error);
      });
    } finally {
      // A new connection is opened on every call, so release it here.
      db.close();
    }
  } catch (error) {
    console.warn('IndexedDB read failed:', error);
    return null;
  }
}
|
||||||
|
|
||||||
async function getCachedFontFromDB(fontFamily: string): Promise<ArrayBuffer | null> {
|
/**
 * Stores a font buffer in the IndexedDB font store.
 *
 * This is a best-effort write: any failure is logged and swallowed so that
 * a caching problem never fails the caller.
 *
 * @param fontFamily - Key to store the font under.
 * @param fontBuffer - Font file contents.
 */
async function saveFontToDB(
  fontFamily: string,
  fontBuffer: ArrayBuffer
): Promise<void> {
  try {
    const db = await openFontDB();

    try {
      // `await` is required: returning the bare promise would let a rejected
      // write bypass the surrounding catch and reach the caller.
      await new Promise<void>((resolve, reject) => {
        const transaction = db.transaction(STORE_NAME, 'readwrite');
        const store = transaction.objectStore(STORE_NAME);
        const request = store.put(fontBuffer, fontFamily);

        request.onsuccess = () => resolve();
        request.onerror = () => reject(request.error);
      });
    } finally {
      // A new connection is opened on every call, so release it here.
      db.close();
    }
  } catch (error) {
    console.warn('IndexedDB write failed:', error);
  }
}
|
||||||
|
|
||||||
/**
 * Loads the font used to render text for an OCR language.
 *
 * Lookup order: in-memory cache, then the IndexedDB cache, then a network
 * fetch from the URL given by `resolveFontUrl`. If the fetch fails for any
 * family other than 'Noto Sans', the function retries with the font for
 * 'eng'.
 *
 * @param lang - Language code used to pick a font family; unmapped codes
 *   use 'Noto Sans'.
 * @returns The font file contents.
 * @throws The fetch error when the 'Noto Sans' font itself cannot be loaded.
 */
export async function getFontForLanguage(lang: string): Promise<ArrayBuffer> {
  const fontFamily = languageToFontFamily[lang] || 'Noto Sans';

  const memoryHit = fontCache.get(fontFamily);
  if (memoryHit) {
    return memoryHit;
  }

  // The persistent cache is an optimisation only; treat a failed read as a
  // miss instead of failing the whole lookup.
  let cachedFont: ArrayBuffer | null = null;
  try {
    cachedFont = await getCachedFontFromDB(fontFamily);
  } catch (cacheError) {
    console.warn('IndexedDB read failed:', cacheError);
  }

  if (cachedFont) {
    fontCache.set(fontFamily, cachedFont);
    return cachedFont;
  }

  try {
    const fontUrl = resolveFontUrl(fontFamily);
    const fontResponse = await fetch(fontUrl);

    if (!fontResponse.ok) {
      throw new Error(`Failed to fetch font file: ${fontResponse.statusText}`);
    }

    const fontBuffer = await fontResponse.arrayBuffer();
    fontCache.set(fontFamily, fontBuffer);

    // A failed cache write must not discard a font that was fetched
    // successfully, so it is handled separately from fetch errors.
    try {
      await saveFontToDB(fontFamily, fontBuffer);
    } catch (cacheError) {
      console.warn('IndexedDB write failed:', cacheError);
    }

    return fontBuffer;
  } catch (error) {
    console.warn(
      `Failed to fetch font for ${lang} (${fontFamily}), falling back to default.`,
      error
    );

    if (fontFamily !== 'Noto Sans') {
      return await getFontForLanguage('eng');
    }

    throw error;
  }
}
|
||||||
|
|
||||||
/**
 * Ordered table of script detectors: a character-class pattern and the
 * language code reported when the pattern matches anywhere in the text.
 * The order determines the order of codes returned by `detectScripts`.
 */
const SCRIPT_PATTERNS: ReadonlyArray<readonly [RegExp, string]> = [
  // Japanese: Hiragana (\u3040-\u309F) & Katakana (\u30A0-\u30FF)
  [/[\u3040-\u309F\u30A0-\u30FF]/, 'jpn'],
  // Korean: Hangul Syllables (\uAC00-\uD7A3) & Jamo (\u1100-\u11FF)
  [/[\uAC00-\uD7A3\u1100-\u11FF]/, 'kor'],
  // Chinese: CJK Unified Ideographs (\u4E00-\u9FFF) & Ext A (\u3400-\u4DBF)
  [/[\u4E00-\u9FFF\u3400-\u4DBF]/, 'chi_sim'],
  // Arabic
  [/[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF]/, 'ara'],
  // Devanagari (Hindi, Marathi, etc.)
  [/[\u0900-\u097F]/, 'hin'],
  // Bengali
  [/[\u0980-\u09FF]/, 'ben'],
  // Tamil
  [/[\u0B80-\u0BFF]/, 'tam'],
  // Telugu
  [/[\u0C00-\u0C7F]/, 'tel'],
  // Kannada
  [/[\u0C80-\u0CFF]/, 'kan'],
  // Malayalam
  [/[\u0D00-\u0D7F]/, 'mal'],
  // Gujarati
  [/[\u0A80-\u0AFF]/, 'guj'],
  // Punjabi (Gurmukhi)
  [/[\u0A00-\u0A7F]/, 'pan'],
  // Oriya
  [/[\u0B00-\u0B7F]/, 'ori'],
  // Sinhala
  [/[\u0D80-\u0DFF]/, 'sin'],
  // Thai
  [/[\u0E00-\u0E7F]/, 'tha'],
  // Lao
  [/[\u0E80-\u0EFF]/, 'lao'],
  // Khmer
  [/[\u1780-\u17FF]/, 'khm'],
  // Myanmar
  [/[\u1000-\u109F]/, 'mya'],
  // Tibetan
  [/[\u0F00-\u0FFF]/, 'bod'],
  // Georgian
  [/[\u10A0-\u10FF]/, 'kat'],
  // Armenian
  [/[\u0530-\u058F]/, 'hye'],
  // Hebrew
  [/[\u0590-\u05FF]/, 'heb'],
  // Ethiopic
  [/[\u1200-\u137F]/, 'amh'],
  // Cherokee
  [/[\u13A0-\u13FF]/, 'chr'],
  // Syriac
  [/[\u0700-\u074F]/, 'syr'],
];

/**
 * Detects which scripts occur in a piece of text.
 *
 * @param text - Text to inspect.
 * @returns Language codes for every script found, in table order. 'eng' is
 *   appended when the text contains an ASCII letter or when no other script
 *   matched, so the result is never empty.
 */
export function detectScripts(text: string): string[] {
  const scripts = new Set<string>();

  for (const [pattern, languageCode] of SCRIPT_PATTERNS) {
    if (pattern.test(text)) {
      scripts.add(languageCode);
    }
  }

  if (scripts.size === 0 || /[a-zA-Z]/.test(text)) {
    scripts.add('eng');
  }

  return Array.from(scripts);
}
|
||||||
|
|
||||||
/**
 * Inclusive UTF-16 code unit ranges and the language code assigned to each.
 * The ranges do not overlap, so lookup order does not affect the result.
 */
const CHAR_LANGUAGE_RANGES: ReadonlyArray<readonly [number, number, string]> = [
  // Latin (Basic + Supplement + Extended)
  [0x0000, 0x024f, 'eng'],
  // Japanese: Hiragana & Katakana
  [0x3040, 0x309f, 'jpn'],
  [0x30a0, 0x30ff, 'jpn'],
  // Korean: Hangul Syllables & Jamo
  [0xac00, 0xd7a3, 'kor'],
  [0x1100, 0x11ff, 'kor'],
  // Chinese: CJK Unified Ideographs (Han) & Ext A
  [0x4e00, 0x9fff, 'chi_sim'],
  [0x3400, 0x4dbf, 'chi_sim'],
  // Arabic
  [0x0600, 0x06ff, 'ara'],
  [0x0750, 0x077f, 'ara'],
  [0x08a0, 0x08ff, 'ara'],
  // Devanagari
  [0x0900, 0x097f, 'hin'],
  // Bengali
  [0x0980, 0x09ff, 'ben'],
  // Tamil
  [0x0b80, 0x0bff, 'tam'],
  // Telugu
  [0x0c00, 0x0c7f, 'tel'],
  // Kannada
  [0x0c80, 0x0cff, 'kan'],
  // Malayalam
  [0x0d00, 0x0d7f, 'mal'],
  // Gujarati
  [0x0a80, 0x0aff, 'guj'],
  // Punjabi (Gurmukhi)
  [0x0a00, 0x0a7f, 'pan'],
  // Oriya
  [0x0b00, 0x0b7f, 'ori'],
  // Sinhala
  [0x0d80, 0x0dff, 'sin'],
  // Thai
  [0x0e00, 0x0e7f, 'tha'],
  // Lao
  [0x0e80, 0x0eff, 'lao'],
  // Khmer
  [0x1780, 0x17ff, 'khm'],
  // Myanmar
  [0x1000, 0x109f, 'mya'],
  // Tibetan
  [0x0f00, 0x0fff, 'bod'],
  // Georgian
  [0x10a0, 0x10ff, 'kat'],
  // Armenian
  [0x0530, 0x058f, 'hye'],
  // Hebrew
  [0x0590, 0x05ff, 'heb'],
  // Ethiopic
  [0x1200, 0x137f, 'amh'],
  // Cherokee
  [0x13a0, 0x13ff, 'chr'],
  // Syriac
  [0x0700, 0x074f, 'syr'],
];

/**
 * Maps a character to the language code of the script it belongs to.
 *
 * Only the first UTF-16 code unit of `char` is examined.
 *
 * @param char - Character to classify.
 * @returns The matching language code, or 'eng' when the code unit falls in
 *   no listed range (including when `char` is empty).
 */
export function getLanguageForChar(char: string): string {
  // NaN for an empty string; every range comparison is then false.
  const code = char.charCodeAt(0);

  for (const [start, end, languageCode] of CHAR_LANGUAGE_RANGES) {
    if (code >= start && code <= end) {
      return languageCode;
    }
  }

  // Default to English (Latin)
  return 'eng';
}
|
||||||
|
|||||||
@@ -1,7 +1,6 @@
|
|||||||
import Tesseract from 'tesseract.js';
|
import Tesseract from 'tesseract.js';
|
||||||
import { PDFDocument, StandardFonts, rgb, PDFFont } from 'pdf-lib';
|
import { PDFDocument, StandardFonts, rgb, PDFFont } from 'pdf-lib';
|
||||||
import fontkit from '@pdf-lib/fontkit';
|
import fontkit from '@pdf-lib/fontkit';
|
||||||
import * as pdfjsLib from 'pdfjs-dist';
|
|
||||||
import { getFontForLanguage } from './font-loader.js';
|
import { getFontForLanguage } from './font-loader.js';
|
||||||
import { OcrPage, OcrLine } from '@/types';
|
import { OcrPage, OcrLine } from '@/types';
|
||||||
import {
|
import {
|
||||||
@@ -10,6 +9,7 @@ import {
|
|||||||
calculateSpaceTransform,
|
calculateSpaceTransform,
|
||||||
} from './hocr-transform.js';
|
} from './hocr-transform.js';
|
||||||
import { getPDFDocument } from './helpers.js';
|
import { getPDFDocument } from './helpers.js';
|
||||||
|
import { createConfiguredTesseractWorker } from './tesseract-runtime.js';
|
||||||
|
|
||||||
export interface OcrOptions {
|
export interface OcrOptions {
|
||||||
language: string;
|
language: string;
|
||||||
@@ -134,11 +134,13 @@ export async function performOcr(
|
|||||||
const { language, resolution, binarize, whitelist, onProgress } = options;
|
const { language, resolution, binarize, whitelist, onProgress } = options;
|
||||||
const progress = onProgress || (() => {});
|
const progress = onProgress || (() => {});
|
||||||
|
|
||||||
const worker = await Tesseract.createWorker(language, 1, {
|
const worker = await createConfiguredTesseractWorker(
|
||||||
logger: function (m: { status: string; progress: number }) {
|
language,
|
||||||
|
1,
|
||||||
|
function (m: { status: string; progress: number }) {
|
||||||
progress(m.status, m.progress || 0);
|
progress(m.status, m.progress || 0);
|
||||||
},
|
}
|
||||||
});
|
);
|
||||||
|
|
||||||
await worker.setParameters({
|
await worker.setParameters({
|
||||||
tessjs_create_hocr: '1',
|
tessjs_create_hocr: '1',
|
||||||
|
|||||||
132
src/js/utils/tesseract-language-availability.ts
Normal file
132
src/js/utils/tesseract-language-availability.ts
Normal file
@@ -0,0 +1,132 @@
|
|||||||
|
import { tesseractLanguages } from '../config/tesseract-languages.js';
|
||||||
|
|
||||||
|
/** Name of the env variable that lists the OCR languages a build offers. */
export const TESSERACT_AVAILABLE_LANGUAGES_ENV_KEY =
  'VITE_TESSERACT_AVAILABLE_LANGUAGES' as const;

/** Subset of the build environment read by the availability helpers. */
type TesseractAvailabilityEnv = Partial<
  Pick<ImportMetaEnv, typeof TESSERACT_AVAILABLE_LANGUAGES_ENV_KEY>
>;

/** A language code that has an entry in `tesseractLanguages`. */
export type TesseractLanguageCode = keyof typeof tesseractLanguages;
|
||||||
|
|
||||||
|
/**
 * Returns the environment object used when a caller does not inject one.
 */
function getDefaultEnv(): TesseractAvailabilityEnv {
  return import.meta.env;
}
|
||||||
|
|
||||||
|
/**
 * Turns a language specification into a clean list of codes.
 *
 * @param value - Either an array of codes or a single string in which codes
 *   are separated by `+` or `,`.
 * @returns The codes trimmed of whitespace, with blanks and duplicates
 *   removed, in first-seen order.
 */
function normalizeLanguageCodes(value: string | string[]): string[] {
  const rawCodes = Array.isArray(value) ? value : value.split(/[+,]/);

  const trimmedCodes = rawCodes
    .map((rawCode) => rawCode.trim())
    .filter((code) => code.length > 0);

  // A Set keeps insertion order, so this dedupes while preserving order.
  return [...new Set(trimmedCodes)];
}
|
||||||
|
|
||||||
|
/**
 * Formats a language code for display.
 *
 * @param code - Language code to format.
 * @returns `"<name> (<code>)"` when the code has an entry in
 *   `tesseractLanguages`, otherwise the bare code.
 */
function formatLanguageLabel(code: string): string {
  const label = tesseractLanguages[code as TesseractLanguageCode];
  return label ? `${label} (${code})` : code;
}
|
||||||
|
|
||||||
|
/**
 * Reads the configured list of available OCR languages.
 *
 * @param env - Environment to read from; defaults to the build-time
 *   environment.
 * @returns `null` when `VITE_TESSERACT_AVAILABLE_LANGUAGES` is unset or
 *   blank (no restriction configured); otherwise the normalized codes,
 *   which may be an empty array if the value contained only separators.
 */
export function resolveConfiguredTesseractAvailableLanguages(
  env: TesseractAvailabilityEnv = getDefaultEnv()
): string[] | null {
  const configuredLanguages = env.VITE_TESSERACT_AVAILABLE_LANGUAGES?.trim();
  if (!configuredLanguages) {
    return null;
  }

  return normalizeLanguageCodes(configuredLanguages);
}
|
||||||
|
|
||||||
|
/**
 * Lists the `[code, name]` entries of `tesseractLanguages` that this build
 * offers.
 *
 * @param env - Environment to read the restriction from; defaults to the
 *   build-time environment.
 * @returns Every entry when no restriction is configured; otherwise only
 *   the entries whose code is in the configured list. Configured codes with
 *   no entry in `tesseractLanguages` are dropped.
 */
export function getAvailableTesseractLanguageEntries(
  env: TesseractAvailabilityEnv = getDefaultEnv()
): Array<[TesseractLanguageCode, string]> {
  const configuredLanguages = resolveConfiguredTesseractAvailableLanguages(env);
  const allEntries = Object.entries(tesseractLanguages) as Array<
    [TesseractLanguageCode, string]
  >;

  if (!configuredLanguages) {
    return allEntries;
  }

  const configuredSet = new Set(configuredLanguages);
  return allEntries.filter(([code]) => configuredSet.has(code));
}
|
||||||
|
|
||||||
|
/**
 * Finds the requested OCR languages that this build does not offer.
 *
 * @param requestedLanguages - Codes as an array, or one string with codes
 *   separated by `+` or `,`.
 * @param env - Environment to read the restriction from; defaults to the
 *   build-time environment.
 * @returns The requested codes missing from the configured list, or an
 *   empty array when no restriction is configured.
 */
export function getUnavailableTesseractLanguages(
  requestedLanguages: string | string[],
  env: TesseractAvailabilityEnv = getDefaultEnv()
): string[] {
  const configuredLanguages = resolveConfiguredTesseractAvailableLanguages(env);
  if (!configuredLanguages) {
    return [];
  }

  const configuredSet = new Set(configuredLanguages);
  return normalizeLanguageCodes(requestedLanguages).filter(
    (code) => !configuredSet.has(code)
  );
}
|
||||||
|
|
||||||
|
/**
 * Formats language codes as a comma-separated list of display labels.
 *
 * @param codes - Language codes to format.
 * @returns The labels joined with `", "`; an empty string for no codes.
 */
export function formatTesseractLanguageList(codes: string[]): string {
  return codes.map((code) => formatLanguageLabel(code)).join(', ');
}
|
||||||
|
|
||||||
|
/**
 * Builds the user-facing message for a request that names OCR languages
 * this build does not offer.
 *
 * @param unavailableLanguages - Requested codes that are not available.
 * @param availableLanguages - Codes the build is configured to offer; may
 *   be empty when the configured value contained no usable codes.
 * @returns A single-line message describing what is bundled, what is
 *   missing, and how to resolve it.
 */
function buildUnsupportedLanguageMessage(
  unavailableLanguages: string[],
  availableLanguages: string[]
): string {
  const unavailableText = formatTesseractLanguageList(unavailableLanguages);
  const availableText = formatTesseractLanguageList(availableLanguages);

  // With nothing available, the normal sentence would read
  // "...only bundles OCR data for ." so it gets its own wording.
  const bundledSentence =
    availableLanguages.length > 0
      ? `This BentoPDF build only bundles OCR data for ${availableText}.`
      : 'This BentoPDF build does not bundle OCR data for any language.';

  return [
    bundledSentence,
    `The requested OCR language is not available: ${unavailableText}.`,
    'Choose one of the bundled languages or rebuild the air-gapped bundle with the missing language added to --ocr-languages.',
  ].join(' ');
}
|
||||||
|
|
||||||
|
/**
 * Thrown when OCR is requested for languages this build does not offer.
 *
 * The message is ready to show to the user; the two arrays carry the
 * underlying codes for programmatic use.
 */
export class UnsupportedOcrLanguageError extends Error {
  /** Requested codes that are not available. */
  readonly unavailableLanguages: string[];
  /** Codes the build is configured to offer. */
  readonly availableLanguages: string[];

  constructor(unavailableLanguages: string[], availableLanguages: string[]) {
    super(
      buildUnsupportedLanguageMessage(unavailableLanguages, availableLanguages)
    );

    // Keeps `instanceof UnsupportedOcrLanguageError` working when the class
    // is compiled down to a target where extending Error loses the prototype.
    Object.setPrototypeOf(this, new.target.prototype);

    this.name = 'UnsupportedOcrLanguageError';
    // Copy so later mutation of the caller's arrays cannot alter the error.
    this.unavailableLanguages = [...unavailableLanguages];
    this.availableLanguages = [...availableLanguages];
  }
}
|
||||||
|
|
||||||
|
/**
 * Verifies that every requested OCR language is offered by this build.
 *
 * Does nothing when no restriction is configured.
 *
 * @param requestedLanguages - Codes as an array, or one string with codes
 *   separated by `+` or `,`.
 * @param env - Environment to read the restriction from; defaults to the
 *   build-time environment.
 * @throws {UnsupportedOcrLanguageError} When at least one requested code is
 *   not in the configured list.
 */
export function assertTesseractLanguagesAvailable(
  requestedLanguages: string | string[],
  env: TesseractAvailabilityEnv = getDefaultEnv()
): void {
  const availableLanguages = resolveConfiguredTesseractAvailableLanguages(env);
  if (!availableLanguages) {
    return;
  }

  const unavailableLanguages = getUnavailableTesseractLanguages(
    requestedLanguages,
    env
  );

  if (unavailableLanguages.length > 0) {
    throw new UnsupportedOcrLanguageError(
      unavailableLanguages,
      availableLanguages
    );
  }
}
|
||||||
130
src/js/utils/tesseract-runtime.ts
Normal file
130
src/js/utils/tesseract-runtime.ts
Normal file
@@ -0,0 +1,130 @@
|
|||||||
|
import Tesseract from 'tesseract.js';
|
||||||
|
import {
|
||||||
|
assertTesseractLanguagesAvailable,
|
||||||
|
TESSERACT_AVAILABLE_LANGUAGES_ENV_KEY,
|
||||||
|
} from './tesseract-language-availability.js';
|
||||||
|
|
||||||
|
/** Env variables that override where Tesseract assets are loaded from. */
const TESSERACT_ENV_KEYS = [
  'VITE_TESSERACT_WORKER_URL',
  'VITE_TESSERACT_CORE_URL',
  'VITE_TESSERACT_LANG_URL',
] as const;

/** The asset override keys plus the available-languages key. */
const TESSERACT_RUNTIME_ENV_KEYS = [
  ...TESSERACT_ENV_KEYS,
  TESSERACT_AVAILABLE_LANGUAGES_ENV_KEY,
] as const;

type TesseractRuntimeEnvKey = (typeof TESSERACT_RUNTIME_ENV_KEYS)[number];

/** Subset of the build environment read by the Tesseract runtime helpers. */
export type TesseractAssetEnv = Partial<
  Pick<ImportMetaEnv, TesseractRuntimeEnvKey>
>;
|
||||||
|
|
||||||
|
/**
 * Normalized Tesseract asset locations. A field is `undefined` when the
 * corresponding env variable is unset or blank.
 */
export interface TesseractAssetConfig {
  /** From `VITE_TESSERACT_WORKER_URL`. */
  workerPath?: string;
  /** From `VITE_TESSERACT_CORE_URL`. */
  corePath?: string;
  /** From `VITE_TESSERACT_LANG_URL`. */
  langPath?: string;
}

// Local aliases for the Tesseract.js types used by this module's API.
export type TesseractLoggerMessage = Tesseract.LoggerMessage;
export type TesseractWorkerOptions = Partial<Tesseract.WorkerOptions>;
export type TesseractWorker = Tesseract.Worker;
|
||||||
|
|
||||||
|
/**
 * Returns the environment object used when a caller does not inject one.
 */
function getDefaultTesseractAssetEnv(): TesseractAssetEnv {
  return import.meta.env;
}
|
||||||
|
|
||||||
|
/**
 * Normalizes a configured directory URL.
 *
 * @param url - Raw configured value; may be missing or blank.
 * @returns The trimmed URL with all trailing slashes removed, or
 *   `undefined` when the value is missing or only whitespace.
 */
function normalizeDirectoryUrl(url?: string): string | undefined {
  const trimmed = url?.trim();
  if (!trimmed) {
    return undefined;
  }

  return trimmed.replace(/\/+$/, '');
}
|
||||||
|
|
||||||
|
/**
 * Normalizes a configured file URL.
 *
 * @param url - Raw configured value; may be missing or blank.
 * @returns The trimmed URL with any trailing slashes removed, or
 *   `undefined` when the value is missing or only whitespace.
 */
function normalizeFileUrl(url?: string): string | undefined {
  const trimmed = url?.trim();
  if (!trimmed) {
    return undefined;
  }

  return trimmed.replace(/\/+$/, '');
}
|
||||||
|
|
||||||
|
/**
 * Reads the three Tesseract asset variables from `env` and normalizes them.
 *
 * @param env Environment to read; defaults to the build-time environment.
 * @returns A config whose fields are `undefined` for unset or blank variables.
 */
export function resolveTesseractAssetConfig(
  env: TesseractAssetEnv = getDefaultTesseractAssetEnv()
): TesseractAssetConfig {
  const {
    VITE_TESSERACT_WORKER_URL: workerUrl,
    VITE_TESSERACT_CORE_URL: coreUrl,
    VITE_TESSERACT_LANG_URL: langUrl,
  } = env;

  const workerPath = normalizeFileUrl(workerUrl);
  const corePath = normalizeDirectoryUrl(coreUrl);
  const langPath = normalizeDirectoryUrl(langUrl);

  return { workerPath, corePath, langPath };
}
|
||||||
|
|
||||||
|
/**
 * Reports whether at least one Tesseract asset location is set.
 *
 * @param config Resolved asset config; defaults to the build-time config.
 * @returns `true` when any of the three paths is a non-empty string.
 */
export function hasConfiguredTesseractOverrides(
  config: TesseractAssetConfig = resolveTesseractAssetConfig()
): boolean {
  const { workerPath, corePath, langPath } = config;
  return [workerPath, corePath, langPath].some((path) => Boolean(path));
}
|
||||||
|
|
||||||
|
/**
 * Reports whether all three Tesseract asset locations are set.
 *
 * @param config Resolved asset config; defaults to the build-time config.
 * @returns `true` only when none of the three paths is missing or empty.
 */
export function hasCompleteTesseractOverrides(
  config: TesseractAssetConfig = resolveTesseractAssetConfig()
): boolean {
  const anyMissing = !config.workerPath || !config.corePath || !config.langPath;
  return !anyMissing;
}
|
||||||
|
|
||||||
|
/**
 * Lists the asset environment variables that still need a value when the
 * self-hosted configuration is only partially filled in.
 *
 * @param config Resolved asset config; defaults to the build-time config.
 * @returns An empty array when nothing is configured at all; otherwise the
 *   names of the unset variables, in `TESSERACT_ENV_KEYS` order.
 */
export function getIncompleteTesseractOverrideKeys(
  config: TesseractAssetConfig = resolveTesseractAssetConfig()
): Array<(typeof TESSERACT_ENV_KEYS)[number]> {
  // Maps each env variable name to the config field it populates.
  const pathByKey: Record<
    (typeof TESSERACT_ENV_KEYS)[number],
    string | undefined
  > = {
    VITE_TESSERACT_WORKER_URL: config.workerPath,
    VITE_TESSERACT_CORE_URL: config.corePath,
    VITE_TESSERACT_LANG_URL: config.langPath,
  };

  return hasConfiguredTesseractOverrides(config)
    ? TESSERACT_ENV_KEYS.filter((key) => !pathByKey[key])
    : [];
}
|
||||||
|
|
||||||
|
/**
 * Builds the options object handed to `Tesseract.createWorker`.
 *
 * @param logger Optional logger callback; included only when provided.
 * @param env Environment to read; defaults to the build-time environment.
 * @returns Logger-only options when no asset variable is set; otherwise the
 *   logger plus `workerPath`, `corePath`, `langPath` and `gzip: true`.
 * @throws Error when only some of the asset variables are set; the message
 *   names the missing ones.
 */
export function buildTesseractWorkerOptions(
  logger?: TesseractWorkerOptions['logger'],
  env: TesseractAssetEnv = getDefaultTesseractAssetEnv()
): TesseractWorkerOptions {
  const config = resolveTesseractAssetConfig(env);
  const baseOptions: TesseractWorkerOptions = logger ? { logger } : {};

  // Nothing configured: let Tesseract.js fall back to its own defaults.
  if (!hasConfiguredTesseractOverrides(config)) {
    return baseOptions;
  }

  if (hasCompleteTesseractOverrides(config)) {
    const { workerPath, corePath, langPath } = config;
    return {
      ...baseOptions,
      workerPath,
      corePath,
      langPath,
      gzip: true,
    };
  }

  // Some, but not all, asset URLs are set: refuse to mix sources.
  const missing = getIncompleteTesseractOverrideKeys(config).join(', ');
  throw new Error(
    `Self-hosted OCR assets are partially configured. Set ${missing} together with the other Tesseract asset URLs.`
  );
}
|
||||||
|
|
||||||
|
/**
 * Creates a Tesseract.js worker using the asset configuration from `env`.
 *
 * The language availability check runs first, so a failed check rejects the
 * returned promise before `Tesseract.createWorker` is ever called.
 *
 * @param language Language code(s) passed through to Tesseract.js.
 * @param oem OCR engine mode passed through to Tesseract.js.
 * @param logger Optional logger callback.
 * @param env Environment to read; defaults to the build-time environment.
 * @returns A promise for the created worker.
 */
export async function createConfiguredTesseractWorker(
  language: string,
  oem: Tesseract.OEM,
  logger?: TesseractWorkerOptions['logger'],
  env: TesseractAssetEnv = getDefaultTesseractAssetEnv()
): Promise<TesseractWorker> {
  assertTesseractLanguagesAvailable(language, env);

  const workerOptions = buildTesseractWorkerOptions(logger, env);
  return Tesseract.createWorker(language, oem, workerOptions);
}
|
||||||
@@ -214,6 +214,10 @@
|
|||||||
>None</span
|
>None</span
|
||||||
>
|
>
|
||||||
</p>
|
</p>
|
||||||
|
<p
|
||||||
|
id="lang-availability-note"
|
||||||
|
class="hidden text-xs text-amber-300 mt-2"
|
||||||
|
></p>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<!-- Advanced settings -->
|
<!-- Advanced settings -->
|
||||||
|
|||||||
81
src/tests/compare/ocr-page.test.ts
Normal file
81
src/tests/compare/ocr-page.test.ts
Normal file
@@ -0,0 +1,81 @@
|
|||||||
|
import { beforeEach, describe, expect, it, vi } from 'vitest';
|
||||||
|
|
||||||
|
const { createConfiguredTesseractWorker } = vi.hoisted(() => ({
|
||||||
|
createConfiguredTesseractWorker: vi.fn(),
|
||||||
|
}));
|
||||||
|
|
||||||
|
const mockWorker = {
|
||||||
|
recognize: vi.fn(),
|
||||||
|
terminate: vi.fn(),
|
||||||
|
};
|
||||||
|
|
||||||
|
vi.mock('../../js/utils/tesseract-runtime', () => ({
|
||||||
|
createConfiguredTesseractWorker,
|
||||||
|
}));
|
||||||
|
|
||||||
|
import { recognizePageCanvas } from '../../js/compare/engine/ocr-page';
|
||||||
|
|
||||||
|
describe('compare OCR page recognition', () => {
|
||||||
|
beforeEach(() => {
|
||||||
|
createConfiguredTesseractWorker.mockReset();
|
||||||
|
mockWorker.recognize.mockReset();
|
||||||
|
mockWorker.terminate.mockReset();
|
||||||
|
createConfiguredTesseractWorker.mockResolvedValue(mockWorker);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('uses the configured Tesseract worker and maps OCR words into compare text items', async () => {
|
||||||
|
const progress = vi.fn();
|
||||||
|
const canvas = {
|
||||||
|
width: 300,
|
||||||
|
height: 150,
|
||||||
|
} as HTMLCanvasElement;
|
||||||
|
|
||||||
|
mockWorker.recognize.mockResolvedValue({
|
||||||
|
data: {
|
||||||
|
words: [
|
||||||
|
{
|
||||||
|
text: 'Hello',
|
||||||
|
bbox: { x0: 10, y0: 20, x1: 60, y1: 40 },
|
||||||
|
},
|
||||||
|
{
|
||||||
|
text: 'world',
|
||||||
|
bbox: { x0: 70, y0: 20, x1: 120, y1: 40 },
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
|
const model = await recognizePageCanvas(canvas, 'eng', progress);
|
||||||
|
|
||||||
|
expect(createConfiguredTesseractWorker).toHaveBeenCalledWith(
|
||||||
|
'eng',
|
||||||
|
1,
|
||||||
|
expect.any(Function)
|
||||||
|
);
|
||||||
|
expect(mockWorker.recognize).toHaveBeenCalledWith(canvas);
|
||||||
|
expect(mockWorker.terminate).toHaveBeenCalledTimes(1);
|
||||||
|
expect(model.source).toBe('ocr');
|
||||||
|
expect(model.hasText).toBe(true);
|
||||||
|
expect(model.plainText).toContain('Hello');
|
||||||
|
expect(model.textItems).toHaveLength(1);
|
||||||
|
|
||||||
|
const logger = createConfiguredTesseractWorker.mock
|
||||||
|
.calls[0][2] as (message: { status: string; progress: number }) => void;
|
||||||
|
logger({ status: 'recognizing text', progress: 0.5 });
|
||||||
|
expect(progress).toHaveBeenCalledWith('recognizing text', 0.5);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('terminates the worker when compare OCR fails', async () => {
|
||||||
|
const canvas = {
|
||||||
|
width: 300,
|
||||||
|
height: 150,
|
||||||
|
} as HTMLCanvasElement;
|
||||||
|
mockWorker.recognize.mockRejectedValueOnce(new Error('compare ocr failed'));
|
||||||
|
|
||||||
|
await expect(recognizePageCanvas(canvas, 'eng')).rejects.toThrow(
|
||||||
|
'compare ocr failed'
|
||||||
|
);
|
||||||
|
|
||||||
|
expect(mockWorker.terminate).toHaveBeenCalledTimes(1);
|
||||||
|
});
|
||||||
|
});
|
||||||
28
src/tests/font-loader.test.ts
Normal file
28
src/tests/font-loader.test.ts
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
import { describe, expect, it } from 'vitest';
|
||||||
|
|
||||||
|
import { getFontAssetFileName } from '../js/config/font-mappings';
|
||||||
|
import { resolveFontUrl } from '../js/utils/font-loader';
|
||||||
|
|
||||||
|
describe('font-loader', () => {
|
||||||
|
it('uses the default public font URL when no offline font base URL is configured', () => {
|
||||||
|
expect(resolveFontUrl('Noto Sans', {})).toBe(
|
||||||
|
'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSans/NotoSans-Regular.ttf'
|
||||||
|
);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('builds a self-hosted font URL when an OCR font base URL is configured', () => {
|
||||||
|
expect(
|
||||||
|
resolveFontUrl('Noto Sans Arabic', {
|
||||||
|
VITE_OCR_FONT_BASE_URL: 'https://internal.example.com/wasm/ocr/fonts/',
|
||||||
|
})
|
||||||
|
).toBe(
|
||||||
|
'https://internal.example.com/wasm/ocr/fonts/NotoSansArabic-Regular.ttf'
|
||||||
|
);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('derives the bundled font asset file name from the default font URL', () => {
|
||||||
|
expect(getFontAssetFileName('Noto Sans SC')).toBe(
|
||||||
|
'NotoSansCJKsc-Regular.otf'
|
||||||
|
);
|
||||||
|
});
|
||||||
|
});
|
||||||
185
src/tests/ocr.test.ts
Normal file
185
src/tests/ocr.test.ts
Normal file
@@ -0,0 +1,185 @@
|
|||||||
|
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
|
||||||
|
|
||||||
|
const {
|
||||||
|
createConfiguredTesseractWorker,
|
||||||
|
getPDFDocument,
|
||||||
|
getFontForLanguage,
|
||||||
|
parseHocrDocument,
|
||||||
|
} = vi.hoisted(() => ({
|
||||||
|
createConfiguredTesseractWorker: vi.fn(),
|
||||||
|
getPDFDocument: vi.fn(),
|
||||||
|
getFontForLanguage: vi.fn(),
|
||||||
|
parseHocrDocument: vi.fn(),
|
||||||
|
}));
|
||||||
|
|
||||||
|
const mockWorker = {
|
||||||
|
setParameters: vi.fn(),
|
||||||
|
recognize: vi.fn(),
|
||||||
|
terminate: vi.fn(),
|
||||||
|
};
|
||||||
|
|
||||||
|
const mockPdfPage = {
|
||||||
|
getViewport: vi.fn(() => ({ width: 200, height: 100 })),
|
||||||
|
render: vi.fn(() => ({ promise: Promise.resolve() })),
|
||||||
|
};
|
||||||
|
|
||||||
|
const mockPdfOutputPage = {
|
||||||
|
drawImage: vi.fn(),
|
||||||
|
drawText: vi.fn(),
|
||||||
|
};
|
||||||
|
|
||||||
|
const mockPdfDoc = {
|
||||||
|
registerFontkit: vi.fn(),
|
||||||
|
embedFont: vi.fn(async () => ({ widthOfTextAtSize: vi.fn(() => 12) })),
|
||||||
|
addPage: vi.fn(() => mockPdfOutputPage),
|
||||||
|
embedPng: vi.fn(async () => ({ id: 'png' })),
|
||||||
|
save: vi.fn(async () => new Uint8Array([1, 2, 3])),
|
||||||
|
};
|
||||||
|
|
||||||
|
vi.mock('../js/utils/tesseract-runtime', () => ({
|
||||||
|
createConfiguredTesseractWorker,
|
||||||
|
}));
|
||||||
|
|
||||||
|
vi.mock('../js/utils/helpers.js', () => ({
|
||||||
|
getPDFDocument,
|
||||||
|
}));
|
||||||
|
|
||||||
|
vi.mock('../js/utils/font-loader.js', () => ({
|
||||||
|
getFontForLanguage,
|
||||||
|
}));
|
||||||
|
|
||||||
|
vi.mock('../js/utils/hocr-transform.js', () => ({
|
||||||
|
parseHocrDocument,
|
||||||
|
calculateWordTransform: vi.fn(),
|
||||||
|
calculateSpaceTransform: vi.fn(),
|
||||||
|
}));
|
||||||
|
|
||||||
|
vi.mock('pdf-lib', () => ({
|
||||||
|
PDFDocument: {
|
||||||
|
create: vi.fn(async () => mockPdfDoc),
|
||||||
|
},
|
||||||
|
StandardFonts: {
|
||||||
|
Helvetica: 'Helvetica',
|
||||||
|
},
|
||||||
|
rgb: vi.fn(() => ({ r: 0, g: 0, b: 0 })),
|
||||||
|
}));
|
||||||
|
|
||||||
|
vi.mock('@pdf-lib/fontkit', () => ({
|
||||||
|
default: {},
|
||||||
|
}));
|
||||||
|
|
||||||
|
import { performOcr } from '../js/utils/ocr';
|
||||||
|
|
||||||
|
describe('performOcr', () => {
|
||||||
|
const originalCreateElement = document.createElement.bind(document);
|
||||||
|
const originalFileReader = globalThis.FileReader;
|
||||||
|
|
||||||
|
beforeEach(() => {
|
||||||
|
createConfiguredTesseractWorker.mockReset();
|
||||||
|
getPDFDocument.mockReset();
|
||||||
|
getFontForLanguage.mockReset();
|
||||||
|
parseHocrDocument.mockReset();
|
||||||
|
|
||||||
|
mockWorker.setParameters.mockReset();
|
||||||
|
mockWorker.recognize.mockReset();
|
||||||
|
mockWorker.terminate.mockReset();
|
||||||
|
mockPdfPage.getViewport.mockClear();
|
||||||
|
mockPdfPage.render.mockClear();
|
||||||
|
mockPdfOutputPage.drawImage.mockClear();
|
||||||
|
mockPdfOutputPage.drawText.mockClear();
|
||||||
|
mockPdfDoc.registerFontkit.mockClear();
|
||||||
|
mockPdfDoc.embedFont.mockClear();
|
||||||
|
mockPdfDoc.addPage.mockClear();
|
||||||
|
mockPdfDoc.embedPng.mockClear();
|
||||||
|
mockPdfDoc.save.mockClear();
|
||||||
|
|
||||||
|
createConfiguredTesseractWorker.mockResolvedValue(mockWorker);
|
||||||
|
getPDFDocument.mockReturnValue({
|
||||||
|
promise: Promise.resolve({
|
||||||
|
numPages: 1,
|
||||||
|
getPage: vi.fn(async () => mockPdfPage),
|
||||||
|
}),
|
||||||
|
});
|
||||||
|
getFontForLanguage.mockResolvedValue(new Uint8Array([1, 2, 3]));
|
||||||
|
mockWorker.recognize.mockResolvedValue({
|
||||||
|
data: {
|
||||||
|
text: 'Recognized text',
|
||||||
|
hocr: '',
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
|
document.createElement = ((tagName: string) => {
|
||||||
|
if (tagName !== 'canvas') {
|
||||||
|
return originalCreateElement(tagName);
|
||||||
|
}
|
||||||
|
|
||||||
|
return {
|
||||||
|
width: 0,
|
||||||
|
height: 0,
|
||||||
|
getContext: vi.fn(() => ({
|
||||||
|
canvas: { width: 200, height: 100 },
|
||||||
|
getImageData: vi.fn(() => ({ data: new Uint8ClampedArray(4) })),
|
||||||
|
putImageData: vi.fn(),
|
||||||
|
})),
|
||||||
|
toBlob: vi.fn((callback: (blob: Blob) => void) => {
|
||||||
|
callback(
|
||||||
|
new Blob([new Uint8Array([1, 2, 3])], { type: 'image/png' })
|
||||||
|
);
|
||||||
|
}),
|
||||||
|
} as unknown as HTMLCanvasElement;
|
||||||
|
}) as typeof document.createElement;
|
||||||
|
|
||||||
|
globalThis.FileReader = class {
|
||||||
|
result: ArrayBuffer = new Uint8Array([1, 2, 3]).buffer;
|
||||||
|
onload: null | (() => void) = null;
|
||||||
|
onerror: null | (() => void) = null;
|
||||||
|
|
||||||
|
readAsArrayBuffer() {
|
||||||
|
this.onload?.();
|
||||||
|
}
|
||||||
|
} as unknown as typeof FileReader;
|
||||||
|
});
|
||||||
|
|
||||||
|
afterEach(() => {
|
||||||
|
document.createElement = originalCreateElement;
|
||||||
|
globalThis.FileReader = originalFileReader;
|
||||||
|
});
|
||||||
|
|
||||||
|
it('uses the configured Tesseract worker and terminates it after OCR completes', async () => {
|
||||||
|
const result = await performOcr(new Uint8Array([1, 2, 3]), {
|
||||||
|
language: 'eng',
|
||||||
|
resolution: 2,
|
||||||
|
binarize: false,
|
||||||
|
whitelist: '',
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(createConfiguredTesseractWorker).toHaveBeenCalledWith(
|
||||||
|
'eng',
|
||||||
|
1,
|
||||||
|
expect.any(Function)
|
||||||
|
);
|
||||||
|
expect(mockWorker.setParameters).toHaveBeenCalledWith({
|
||||||
|
tessjs_create_hocr: '1',
|
||||||
|
tessedit_pageseg_mode: '3',
|
||||||
|
});
|
||||||
|
expect(mockWorker.recognize).toHaveBeenCalledTimes(1);
|
||||||
|
expect(mockWorker.terminate).toHaveBeenCalledTimes(1);
|
||||||
|
expect(result.fullText).toContain('Recognized text');
|
||||||
|
expect(result.pdfBytes).toBeInstanceOf(Uint8Array);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('terminates the Tesseract worker when OCR fails', async () => {
|
||||||
|
mockWorker.recognize.mockRejectedValueOnce(new Error('ocr failed'));
|
||||||
|
|
||||||
|
await expect(
|
||||||
|
performOcr(new Uint8Array([1, 2, 3]), {
|
||||||
|
language: 'eng',
|
||||||
|
resolution: 2,
|
||||||
|
binarize: false,
|
||||||
|
whitelist: '',
|
||||||
|
})
|
||||||
|
).rejects.toThrow('ocr failed');
|
||||||
|
|
||||||
|
expect(mockWorker.terminate).toHaveBeenCalledTimes(1);
|
||||||
|
});
|
||||||
|
});
|
||||||
128
src/tests/tesseract-runtime.test.ts
Normal file
128
src/tests/tesseract-runtime.test.ts
Normal file
@@ -0,0 +1,128 @@
|
|||||||
|
import { beforeEach, describe, expect, it, vi } from 'vitest';
|
||||||
|
|
||||||
|
const { createWorker } = vi.hoisted(() => ({
|
||||||
|
createWorker: vi.fn(),
|
||||||
|
}));
|
||||||
|
|
||||||
|
vi.mock('tesseract.js', () => ({
|
||||||
|
default: {
|
||||||
|
createWorker,
|
||||||
|
},
|
||||||
|
}));
|
||||||
|
|
||||||
|
import {
|
||||||
|
buildTesseractWorkerOptions,
|
||||||
|
createConfiguredTesseractWorker,
|
||||||
|
getIncompleteTesseractOverrideKeys,
|
||||||
|
hasCompleteTesseractOverrides,
|
||||||
|
hasConfiguredTesseractOverrides,
|
||||||
|
resolveTesseractAssetConfig,
|
||||||
|
} from '../js/utils/tesseract-runtime';
|
||||||
|
import {
|
||||||
|
assertTesseractLanguagesAvailable,
|
||||||
|
getAvailableTesseractLanguageEntries,
|
||||||
|
getUnavailableTesseractLanguages,
|
||||||
|
UnsupportedOcrLanguageError,
|
||||||
|
} from '../js/utils/tesseract-language-availability';
|
||||||
|
|
||||||
|
describe('tesseract-runtime', () => {
|
||||||
|
beforeEach(() => {
|
||||||
|
createWorker.mockReset();
|
||||||
|
});
|
||||||
|
|
||||||
|
it('normalizes self-hosted OCR asset URLs', () => {
|
||||||
|
const config = resolveTesseractAssetConfig({
|
||||||
|
VITE_TESSERACT_WORKER_URL:
|
||||||
|
'https://internal.example.com/ocr/worker.min.js/',
|
||||||
|
VITE_TESSERACT_CORE_URL: 'https://internal.example.com/ocr/core/',
|
||||||
|
VITE_TESSERACT_LANG_URL: 'https://internal.example.com/ocr/lang-data/',
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(config).toEqual({
|
||||||
|
workerPath: 'https://internal.example.com/ocr/worker.min.js',
|
||||||
|
corePath: 'https://internal.example.com/ocr/core',
|
||||||
|
langPath: 'https://internal.example.com/ocr/lang-data',
|
||||||
|
});
|
||||||
|
expect(hasConfiguredTesseractOverrides(config)).toBe(true);
|
||||||
|
expect(hasCompleteTesseractOverrides(config)).toBe(true);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('returns logger-only options when no self-hosted OCR assets are configured', () => {
|
||||||
|
const logger = vi.fn();
|
||||||
|
|
||||||
|
expect(buildTesseractWorkerOptions(logger, {})).toEqual({ logger });
|
||||||
|
expect(
|
||||||
|
hasConfiguredTesseractOverrides(resolveTesseractAssetConfig({}))
|
||||||
|
).toBe(false);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('throws on partial OCR asset configuration', () => {
|
||||||
|
const env = {
|
||||||
|
VITE_TESSERACT_WORKER_URL:
|
||||||
|
'https://internal.example.com/ocr/worker.min.js',
|
||||||
|
VITE_TESSERACT_CORE_URL: 'https://internal.example.com/ocr/core',
|
||||||
|
};
|
||||||
|
|
||||||
|
expect(
|
||||||
|
getIncompleteTesseractOverrideKeys(resolveTesseractAssetConfig(env))
|
||||||
|
).toEqual(['VITE_TESSERACT_LANG_URL']);
|
||||||
|
expect(() => buildTesseractWorkerOptions(undefined, env)).toThrow(
|
||||||
|
'Self-hosted OCR assets are partially configured'
|
||||||
|
);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('passes configured OCR asset URLs to Tesseract.createWorker', async () => {
|
||||||
|
const logger = vi.fn();
|
||||||
|
createWorker.mockResolvedValue({ id: 'worker' });
|
||||||
|
|
||||||
|
await createConfiguredTesseractWorker('eng', 1, logger, {
|
||||||
|
VITE_TESSERACT_WORKER_URL:
|
||||||
|
'https://internal.example.com/ocr/worker.min.js',
|
||||||
|
VITE_TESSERACT_CORE_URL: 'https://internal.example.com/ocr/core',
|
||||||
|
VITE_TESSERACT_LANG_URL: 'https://internal.example.com/ocr/lang-data',
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(createWorker).toHaveBeenCalledWith('eng', 1, {
|
||||||
|
logger,
|
||||||
|
workerPath: 'https://internal.example.com/ocr/worker.min.js',
|
||||||
|
corePath: 'https://internal.example.com/ocr/core',
|
||||||
|
langPath: 'https://internal.example.com/ocr/lang-data',
|
||||||
|
gzip: true,
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
it('filters OCR language entries when the build restricts bundled languages', () => {
|
||||||
|
expect(
|
||||||
|
getAvailableTesseractLanguageEntries({
|
||||||
|
VITE_TESSERACT_AVAILABLE_LANGUAGES: 'eng,deu',
|
||||||
|
})
|
||||||
|
).toEqual([
|
||||||
|
['eng', 'English'],
|
||||||
|
['deu', 'German'],
|
||||||
|
]);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('reports unavailable OCR languages for restricted air-gap builds', () => {
|
||||||
|
expect(
|
||||||
|
getUnavailableTesseractLanguages('eng+fra', {
|
||||||
|
VITE_TESSERACT_AVAILABLE_LANGUAGES: 'eng,deu',
|
||||||
|
})
|
||||||
|
).toEqual(['fra']);
|
||||||
|
|
||||||
|
expect(() =>
|
||||||
|
assertTesseractLanguagesAvailable('eng+fra', {
|
||||||
|
VITE_TESSERACT_AVAILABLE_LANGUAGES: 'eng,deu',
|
||||||
|
})
|
||||||
|
).toThrow(UnsupportedOcrLanguageError);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('blocks worker creation when OCR requests an unbundled language', async () => {
|
||||||
|
await expect(
|
||||||
|
createConfiguredTesseractWorker('fra', 1, undefined, {
|
||||||
|
VITE_TESSERACT_AVAILABLE_LANGUAGES: 'eng,deu',
|
||||||
|
})
|
||||||
|
).rejects.toThrow('This BentoPDF build only bundles OCR data for');
|
||||||
|
|
||||||
|
expect(createWorker).not.toHaveBeenCalled();
|
||||||
|
});
|
||||||
|
});
|
||||||
14
src/types/globals.d.ts
vendored
14
src/types/globals.d.ts
vendored
@@ -1 +1,15 @@
|
|||||||
|
/// <reference types="vite/client" />
|
||||||
|
|
||||||
|
interface ImportMetaEnv {
|
||||||
|
readonly VITE_TESSERACT_WORKER_URL?: string;
|
||||||
|
readonly VITE_TESSERACT_CORE_URL?: string;
|
||||||
|
readonly VITE_TESSERACT_LANG_URL?: string;
|
||||||
|
readonly VITE_TESSERACT_AVAILABLE_LANGUAGES?: string;
|
||||||
|
readonly VITE_OCR_FONT_BASE_URL?: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
interface ImportMeta {
|
||||||
|
readonly env: ImportMetaEnv;
|
||||||
|
}
|
||||||
|
|
||||||
declare const __SIMPLE_MODE__: boolean;
|
declare const __SIMPLE_MODE__: boolean;
|
||||||
|
|||||||
Reference in New Issue
Block a user