From 5232102ac0b4ab3f898e126492bf61195091c86c Mon Sep 17 00:00:00 2001 From: alam00000 Date: Tue, 10 Mar 2026 13:47:46 +0530 Subject: [PATCH] feat: enhance PDF comparison with new change types and zoom functionality - Added support for 'moved' and 'style-changed' change types in PDF comparison. - Implemented category filters for changes, allowing users to filter by text, images, headers, annotations, formatting, and background. - Introduced zoom functionality with buttons for zooming in, out, and resetting to default. - Updated UI to reflect new change types and categories, including visual indicators for moved and style-changed items. - Enhanced summary display to include counts for moved and style-changed changes. - Refactored rendering logic to accommodate zoom levels and improve performance. - Added tests for new change detection features and category assignments. --- public/locales/ar/tools.json | 42 ++- public/locales/be/tools.json | 42 ++- public/locales/da/tools.json | 42 ++- public/locales/de/tools.json | 42 ++- public/locales/en/tools.json | 42 ++- public/locales/es/tools.json | 42 ++- public/locales/fr/tools.json | 42 ++- public/locales/id/tools.json | 42 ++- public/locales/it/tools.json | 42 ++- public/locales/ko/tools.json | 42 ++- public/locales/nl/tools.json | 42 ++- public/locales/pt/tools.json | 42 ++- public/locales/sv/tools.json | 42 ++- public/locales/tr/tools.json | 42 ++- public/locales/vi/tools.json | 42 ++- public/locales/zh-TW/tools.json | 42 ++- public/locales/zh/tools.json | 42 ++- src/js/compare/config.ts | 2 + src/js/compare/engine/compare-content.ts | 213 +++++++++++ src/js/compare/engine/compare-page-models.ts | 110 +++++- src/js/compare/engine/compare.worker.ts | 77 ++++ src/js/compare/engine/diff-text-runs.ts | 245 ++++++++++++- src/js/compare/engine/extract-page-model.ts | 174 ++++++++- src/js/compare/engine/text-normalization.ts | 26 ++ .../compare/reporting/export-compare-pdf.ts | 16 +- src/js/compare/types.ts | 70 +++- 
src/js/compare/worker-api.ts | 90 +++++ src/js/logic/compare-pdfs-page.ts | 207 +++++++++-- src/js/logic/compare-render.ts | 159 +++++++-- src/pages/compare-pdfs.html | 336 +++++++++++++----- src/tests/compare/diff-text-runs.test.ts | 247 ++++++++++++- 31 files changed, 2503 insertions(+), 183 deletions(-) create mode 100644 src/js/compare/engine/compare-content.ts create mode 100644 src/js/compare/engine/compare.worker.ts create mode 100644 src/js/compare/worker-api.ts diff --git a/public/locales/ar/tools.json b/public/locales/ar/tools.json index ae5082c..10ec041 100644 --- a/public/locales/ar/tools.json +++ b/public/locales/ar/tools.json @@ -292,7 +292,47 @@ }, "comparePdfs": { "name": "مقارنة ملفات PDF", - "subtitle": "مقارنة ملفي PDF جنبًا إلى جنب." + "subtitle": "مقارنة ملفي PDF جنبًا إلى جنب.", + "firstPdf": "ملف PDF الأول", + "secondPdf": "ملف PDF الثاني", + "clickOrDrop": "انقر أو أفلت", + "page": "الصفحة", + "overlay": "تراكب", + "sideBySide": "جنبًا إلى جنب", + "flicker": "وميض", + "syncScroll": "مزامنة التمرير", + "export": "تصدير", + "exportAsPdf": "تصدير كملف PDF", + "splitView": "عرض مقسوم", + "alternating": "بالتناوب", + "leftDocument": "المستند الأيسر", + "rightDocument": "المستند الأيمن", + "original": "الأصلي", + "modified": "المعدل", + "searchChanges": "ابحث في التغييرات...", + "deleted": "محذوف", + "added": "مضاف", + "prevPage": "الصفحة السابقة", + "nextPage": "الصفحة التالية", + "prevChange": "التغيير السابق", + "nextChange": "التغيير التالي", + "uploadTwoPdfs": "حمّل ملفي PDF لرؤية الاختلافات.", + "noDifferences": "لم يتم اكتشاف اختلافات في هذه الصفحة.", + "noMatchingChanges": "لا توجد تغييرات تطابق عامل التصفية الحالي.", + "pageNotExist": "الصفحة {{page}} غير موجودة في ملف PDF هذا.", + "noPairedPage": "لا توجد صفحة مقترنة لهذا الجانب.", + "buildingModel": "جارٍ إنشاء نموذج إقران الصفحات...", + "indexingPdf": "جارٍ فهرسة PDF {{num}} الصفحة {{page}} من {{total}}...", + "loadingComparison": "جارٍ تحميل المقارنة {{current}} من {{total}}...", + 
"runningOcr": "جارٍ تشغيل OCR على الصفحة {{page}}...", + "preparingExport": "جارٍ تجهيز تصدير PDF...", + "renderingPage": "جارٍ عرض الصفحة {{current}} من {{total}}...", + "exportError": "خطأ في التصدير", + "exportFailed": "تعذر تصدير ملف PDF المقارن.", + "loadingFile": "جارٍ تحميل {{name}}...", + "invalidFile": "ملف غير صالح", + "invalidFileMsg": "يرجى اختيار ملف PDF صالح.", + "loadError": "تعذر تحميل ملف PDF. قد يكون تالفًا أو محميًا بكلمة مرور." }, "posterizePdf": { "name": "تقسيم PDF إلى ملصقات", diff --git a/public/locales/be/tools.json b/public/locales/be/tools.json index cd6f23a..2d959f1 100644 --- a/public/locales/be/tools.json +++ b/public/locales/be/tools.json @@ -292,7 +292,47 @@ }, "comparePdfs": { "name": "Параўнаць PDF", - "subtitle": "Параўнаць два PDF побач." + "subtitle": "Параўнаць два PDF побач.", + "firstPdf": "Першы PDF", + "secondPdf": "Другі PDF", + "clickOrDrop": "Націсніце або перацягніце", + "page": "Старонка", + "overlay": "Накладанне", + "sideBySide": "Побач", + "flicker": "Мігценне", + "syncScroll": "Сінхранізаваць пракрутку", + "export": "Экспарт", + "exportAsPdf": "Экспартаваць як PDF", + "splitView": "Падзелены выгляд", + "alternating": "Чаргаванне", + "leftDocument": "Левы дакумент", + "rightDocument": "Правы дакумент", + "original": "Арыгінал", + "modified": "Зменены", + "searchChanges": "Шукаць змены...", + "deleted": "Выдалена", + "added": "Дададзена", + "prevPage": "Папярэдняя старонка", + "nextPage": "Наступная старонка", + "prevChange": "Папярэдняя змена", + "nextChange": "Наступная змена", + "uploadTwoPdfs": "Загрузіце два PDF, каб убачыць адрозненні.", + "noDifferences": "На гэтай старонцы адрозненняў не выяўлена.", + "noMatchingChanges": "Няма змен, што адпавядаюць бягучаму фільтру.", + "pageNotExist": "Старонка {{page}} не існуе ў гэтым PDF.", + "noPairedPage": "Для гэтага боку няма спаранай старонкі.", + "buildingModel": "Стварэнне мадэлі супастаўлення старонак...", + "indexingPdf": "Індэксацыя PDF {{num}}, старонка 
{{page}} з {{total}}...", + "loadingComparison": "Загрузка параўнання {{current}} з {{total}}...", + "runningOcr": "Запуск OCR на старонцы {{page}}...", + "preparingExport": "Падрыхтоўка экспарту PDF...", + "renderingPage": "Адмалёўка старонкі {{current}} з {{total}}...", + "exportError": "Памылка экспарту", + "exportFailed": "Не ўдалося экспартаваць PDF параўнання.", + "loadingFile": "Загрузка {{name}}...", + "invalidFile": "Няправільны файл", + "invalidFileMsg": "Калі ласка, абярыце сапраўдны PDF-файл.", + "loadError": "Не ўдалося загрузіць PDF. Магчыма, ён пашкоджаны або абаронены паролем." }, "posterizePdf": { "name": "Пераўтварыць у постэр", diff --git a/public/locales/da/tools.json b/public/locales/da/tools.json index 2524b97..df4c677 100644 --- a/public/locales/da/tools.json +++ b/public/locales/da/tools.json @@ -292,7 +292,47 @@ }, "comparePdfs": { "name": "Sammenlign PDF’er", - "subtitle": "Sammenlign to PDF’er side om side." + "subtitle": "Sammenlign to PDF’er side om side.", + "firstPdf": "Første PDF", + "secondPdf": "Anden PDF", + "clickOrDrop": "Klik eller slip", + "page": "Side", + "overlay": "Overlejring", + "sideBySide": "Side om side", + "flicker": "Blink", + "syncScroll": "Synkroniser rulning", + "export": "Eksportér", + "exportAsPdf": "Eksportér som PDF", + "splitView": "Opdelt visning", + "alternating": "Skiftevis", + "leftDocument": "Venstre dokument", + "rightDocument": "Højre dokument", + "original": "Original", + "modified": "Ændret", + "searchChanges": "Søg ændringer...", + "deleted": "Slettet", + "added": "Tilføjet", + "prevPage": "Forrige side", + "nextPage": "Næste side", + "prevChange": "Forrige ændring", + "nextChange": "Næste ændring", + "uploadTwoPdfs": "Upload to PDF’er for at se forskellene.", + "noDifferences": "Ingen forskelle fundet på denne side.", + "noMatchingChanges": "Ingen ændringer matcher det aktuelle filter.", + "pageNotExist": "Side {{page}} findes ikke i denne PDF.", + "noPairedPage": "Ingen parret side for denne 
side.", + "buildingModel": "Opbygger sideparringsmodel...", + "indexingPdf": "Indekserer PDF {{num}}, side {{page}} af {{total}}...", + "loadingComparison": "Indlæser sammenligning {{current}} af {{total}}...", + "runningOcr": "Kører OCR på side {{page}}...", + "preparingExport": "Forbereder PDF-eksport...", + "renderingPage": "Renderer side {{current}} af {{total}}...", + "exportError": "Eksportfejl", + "exportFailed": "Kunne ikke eksportere sammenlignings-PDF.", + "loadingFile": "Indlæser {{name}}...", + "invalidFile": "Ugyldig fil", + "invalidFileMsg": "Vælg venligst en gyldig PDF-fil.", + "loadError": "Kunne ikke indlæse PDF. Den kan være beskadiget eller beskyttet med adgangskode." }, "posterizePdf": { "name": "Posterisér PDF", diff --git a/public/locales/de/tools.json b/public/locales/de/tools.json index ea93f3a..b818605 100644 --- a/public/locales/de/tools.json +++ b/public/locales/de/tools.json @@ -292,7 +292,47 @@ }, "comparePdfs": { "name": "PDFs vergleichen", - "subtitle": "Zwei PDFs nebeneinander vergleichen." 
+ "subtitle": "Zwei PDFs nebeneinander vergleichen.", + "firstPdf": "Erste PDF", + "secondPdf": "Zweite PDF", + "clickOrDrop": "Klicken oder ablegen", + "page": "Seite", + "overlay": "Überlagerung", + "sideBySide": "Nebeneinander", + "flicker": "Flackern", + "syncScroll": "Synchrones Scrollen", + "export": "Exportieren", + "exportAsPdf": "Als PDF exportieren", + "splitView": "Geteilte Ansicht", + "alternating": "Abwechselnd", + "leftDocument": "Linkes Dokument", + "rightDocument": "Rechtes Dokument", + "original": "Original", + "modified": "Geändert", + "searchChanges": "Änderungen suchen...", + "deleted": "Gelöscht", + "added": "Hinzugefügt", + "prevPage": "Vorherige Seite", + "nextPage": "Nächste Seite", + "prevChange": "Vorherige Änderung", + "nextChange": "Nächste Änderung", + "uploadTwoPdfs": "Laden Sie zwei PDFs hoch, um Unterschiede zu sehen.", + "noDifferences": "Auf dieser Seite wurden keine Unterschiede gefunden.", + "noMatchingChanges": "Keine Änderungen entsprechen dem aktuellen Filter.", + "pageNotExist": "Seite {{page}} existiert nicht in dieser PDF.", + "noPairedPage": "Für diese Seite gibt es keine zugeordnete Seite.", + "buildingModel": "Seitenzuordnungsmodell wird erstellt...", + "indexingPdf": "PDF {{num}}, Seite {{page}} von {{total}} wird indiziert...", + "loadingComparison": "Vergleich {{current}} von {{total}} wird geladen...", + "runningOcr": "OCR wird auf Seite {{page}} ausgeführt...", + "preparingExport": "PDF-Export wird vorbereitet...", + "renderingPage": "Seite {{current}} von {{total}} wird gerendert...", + "exportError": "Exportfehler", + "exportFailed": "Vergleichs-PDF konnte nicht exportiert werden.", + "loadingFile": "{{name}} wird geladen...", + "invalidFile": "Ungültige Datei", + "invalidFileMsg": "Bitte wählen Sie eine gültige PDF-Datei aus.", + "loadError": "PDF konnte nicht geladen werden. Sie ist möglicherweise beschädigt oder passwortgeschützt." 
}, "posterizePdf": { "name": "PDF posterisieren", diff --git a/public/locales/en/tools.json b/public/locales/en/tools.json index 9783bee..f9d3785 100644 --- a/public/locales/en/tools.json +++ b/public/locales/en/tools.json @@ -292,7 +292,47 @@ }, "comparePdfs": { "name": "Compare PDFs", - "subtitle": "Compare two PDFs side by side." + "subtitle": "Compare two PDFs side by side.", + "firstPdf": "First PDF", + "secondPdf": "Second PDF", + "clickOrDrop": "Click or drop", + "page": "Page", + "overlay": "Overlay", + "sideBySide": "Side-by-Side", + "flicker": "Flicker", + "syncScroll": "Sync scroll", + "export": "Export", + "exportAsPdf": "Export as PDF", + "splitView": "Split view", + "alternating": "Alternating", + "leftDocument": "Left Document", + "rightDocument": "Right Document", + "original": "Original", + "modified": "Modified", + "searchChanges": "Search changes...", + "deleted": "Deleted", + "added": "Added", + "prevPage": "Previous page", + "nextPage": "Next page", + "prevChange": "Previous change", + "nextChange": "Next change", + "uploadTwoPdfs": "Upload two PDFs to see differences.", + "noDifferences": "No differences detected on this page.", + "noMatchingChanges": "No changes match the current filter.", + "pageNotExist": "Page {{page}} does not exist in this PDF.", + "noPairedPage": "No paired page for this side.", + "buildingModel": "Building page pairing model...", + "indexingPdf": "Indexing PDF {{num}} page {{page}} of {{total}}...", + "loadingComparison": "Loading comparison {{current}} of {{total}}...", + "runningOcr": "Running OCR on page {{page}}...", + "preparingExport": "Preparing PDF export...", + "renderingPage": "Rendering page {{current}} of {{total}}...", + "exportError": "Export Error", + "exportFailed": "Could not export comparison PDF.", + "loadingFile": "Loading {{name}}...", + "invalidFile": "Invalid File", + "invalidFileMsg": "Please select a valid PDF file.", + "loadError": "Could not load PDF. It may be corrupt or password-protected." 
}, "posterizePdf": { "name": "Posterize PDF", diff --git a/public/locales/es/tools.json b/public/locales/es/tools.json index 1ca9a1f..bd4671d 100644 --- a/public/locales/es/tools.json +++ b/public/locales/es/tools.json @@ -292,7 +292,47 @@ }, "comparePdfs": { "name": "Comparar PDFs", - "subtitle": "Compara dos PDFs lado a lado." + "subtitle": "Compara dos PDFs lado a lado.", + "firstPdf": "Primer PDF", + "secondPdf": "Segundo PDF", + "clickOrDrop": "Haz clic o suelta", + "page": "Página", + "overlay": "Superposición", + "sideBySide": "Lado a lado", + "flicker": "Parpadeo", + "syncScroll": "Sincronizar desplazamiento", + "export": "Exportar", + "exportAsPdf": "Exportar como PDF", + "splitView": "Vista dividida", + "alternating": "Alternando", + "leftDocument": "Documento izquierdo", + "rightDocument": "Documento derecho", + "original": "Original", + "modified": "Modificado", + "searchChanges": "Buscar cambios...", + "deleted": "Eliminado", + "added": "Añadido", + "prevPage": "Página anterior", + "nextPage": "Página siguiente", + "prevChange": "Cambio anterior", + "nextChange": "Cambio siguiente", + "uploadTwoPdfs": "Sube dos PDFs para ver las diferencias.", + "noDifferences": "No se detectaron diferencias en esta página.", + "noMatchingChanges": "Ningún cambio coincide con el filtro actual.", + "pageNotExist": "La página {{page}} no existe en este PDF.", + "noPairedPage": "No hay una página emparejada para este lado.", + "buildingModel": "Creando el modelo de emparejamiento de páginas...", + "indexingPdf": "Indexando PDF {{num}}, página {{page}} de {{total}}...", + "loadingComparison": "Cargando comparación {{current}} de {{total}}...", + "runningOcr": "Ejecutando OCR en la página {{page}}...", + "preparingExport": "Preparando la exportación del PDF...", + "renderingPage": "Renderizando página {{current}} de {{total}}...", + "exportError": "Error de exportación", + "exportFailed": "No se pudo exportar el PDF de comparación.", + "loadingFile": "Cargando {{name}}...", 
+ "invalidFile": "Archivo no válido", + "invalidFileMsg": "Selecciona un archivo PDF válido.", + "loadError": "No se pudo cargar el PDF. Puede estar dañado o protegido con contraseña." }, "posterizePdf": { "name": "Posterizar PDF", diff --git a/public/locales/fr/tools.json b/public/locales/fr/tools.json index 4a208dd..f03d44c 100644 --- a/public/locales/fr/tools.json +++ b/public/locales/fr/tools.json @@ -292,7 +292,47 @@ }, "comparePdfs": { "name": "Comparer des PDF", - "subtitle": "Comparer deux PDF côte à côte." + "subtitle": "Comparer deux PDF côte à côte.", + "firstPdf": "Premier PDF", + "secondPdf": "Deuxième PDF", + "clickOrDrop": "Cliquer ou déposer", + "page": "Page", + "overlay": "Superposition", + "sideBySide": "Côte à côte", + "flicker": "Clignotement", + "syncScroll": "Synchroniser le défilement", + "export": "Exporter", + "exportAsPdf": "Exporter en PDF", + "splitView": "Vue divisée", + "alternating": "Alterné", + "leftDocument": "Document de gauche", + "rightDocument": "Document de droite", + "original": "Original", + "modified": "Modifié", + "searchChanges": "Rechercher des modifications...", + "deleted": "Supprimé", + "added": "Ajouté", + "prevPage": "Page précédente", + "nextPage": "Page suivante", + "prevChange": "Modification précédente", + "nextChange": "Modification suivante", + "uploadTwoPdfs": "Téléversez deux PDF pour voir les différences.", + "noDifferences": "Aucune différence détectée sur cette page.", + "noMatchingChanges": "Aucune modification ne correspond au filtre actuel.", + "pageNotExist": "La page {{page}} n’existe pas dans ce PDF.", + "noPairedPage": "Aucune page associée pour ce côté.", + "buildingModel": "Création du modèle d’appariement des pages...", + "indexingPdf": "Indexation du PDF {{num}}, page {{page}} sur {{total}}...", + "loadingComparison": "Chargement de la comparaison {{current}} sur {{total}}...", + "runningOcr": "Exécution de l’OCR sur la page {{page}}...", + "preparingExport": "Préparation de l’export PDF...", 
+ "renderingPage": "Rendu de la page {{current}} sur {{total}}...", + "exportError": "Erreur d’export", + "exportFailed": "Impossible d’exporter le PDF de comparaison.", + "loadingFile": "Chargement de {{name}}...", + "invalidFile": "Fichier invalide", + "invalidFileMsg": "Veuillez sélectionner un fichier PDF valide.", + "loadError": "Impossible de charger le PDF. Il est peut-être corrompu ou protégé par mot de passe." }, "posterizePdf": { "name": "Posteriser un PDF", diff --git a/public/locales/id/tools.json b/public/locales/id/tools.json index b75ea0f..bf548a7 100644 --- a/public/locales/id/tools.json +++ b/public/locales/id/tools.json @@ -292,7 +292,47 @@ }, "comparePdfs": { "name": "Bandingkan PDF", - "subtitle": "Bandingkan dua PDF berdampingan." + "subtitle": "Bandingkan dua PDF berdampingan.", + "firstPdf": "PDF pertama", + "secondPdf": "PDF kedua", + "clickOrDrop": "Klik atau letakkan", + "page": "Halaman", + "overlay": "Hamparan", + "sideBySide": "Berdampingan", + "flicker": "Kedip", + "syncScroll": "Sinkronkan gulir", + "export": "Ekspor", + "exportAsPdf": "Ekspor sebagai PDF", + "splitView": "Tampilan terbagi", + "alternating": "Bergantian", + "leftDocument": "Dokumen kiri", + "rightDocument": "Dokumen kanan", + "original": "Asli", + "modified": "Diubah", + "searchChanges": "Cari perubahan...", + "deleted": "Dihapus", + "added": "Ditambahkan", + "prevPage": "Halaman sebelumnya", + "nextPage": "Halaman berikutnya", + "prevChange": "Perubahan sebelumnya", + "nextChange": "Perubahan berikutnya", + "uploadTwoPdfs": "Unggah dua PDF untuk melihat perbedaannya.", + "noDifferences": "Tidak ada perbedaan yang terdeteksi pada halaman ini.", + "noMatchingChanges": "Tidak ada perubahan yang cocok dengan filter saat ini.", + "pageNotExist": "Halaman {{page}} tidak ada di PDF ini.", + "noPairedPage": "Tidak ada halaman pasangan untuk sisi ini.", + "buildingModel": "Membangun model pemasangan halaman...", + "indexingPdf": "Mengindeks PDF {{num}} halaman {{page}} dari 
{{total}}...", + "loadingComparison": "Memuat perbandingan {{current}} dari {{total}}...", + "runningOcr": "Menjalankan OCR pada halaman {{page}}...", + "preparingExport": "Menyiapkan ekspor PDF...", + "renderingPage": "Merender halaman {{current}} dari {{total}}...", + "exportError": "Kesalahan ekspor", + "exportFailed": "Tidak dapat mengekspor PDF perbandingan.", + "loadingFile": "Memuat {{name}}...", + "invalidFile": "File tidak valid", + "invalidFileMsg": "Silakan pilih file PDF yang valid.", + "loadError": "Tidak dapat memuat PDF. Mungkin rusak atau dilindungi kata sandi." }, "posterizePdf": { "name": "Posterisasi PDF", diff --git a/public/locales/it/tools.json b/public/locales/it/tools.json index 2736bc5..0945292 100644 --- a/public/locales/it/tools.json +++ b/public/locales/it/tools.json @@ -292,7 +292,47 @@ }, "comparePdfs": { "name": "Confronta PDF", - "subtitle": "Confronta due PDF fianco a fianco." + "subtitle": "Confronta due PDF fianco a fianco.", + "firstPdf": "Primo PDF", + "secondPdf": "Secondo PDF", + "clickOrDrop": "Clicca o rilascia", + "page": "Pagina", + "overlay": "Sovrapposizione", + "sideBySide": "Affiancato", + "flicker": "Lampeggio", + "syncScroll": "Sincronizza scorrimento", + "export": "Esporta", + "exportAsPdf": "Esporta come PDF", + "splitView": "Vista divisa", + "alternating": "Alternato", + "leftDocument": "Documento sinistro", + "rightDocument": "Documento destro", + "original": "Originale", + "modified": "Modificato", + "searchChanges": "Cerca modifiche...", + "deleted": "Eliminato", + "added": "Aggiunto", + "prevPage": "Pagina precedente", + "nextPage": "Pagina successiva", + "prevChange": "Modifica precedente", + "nextChange": "Modifica successiva", + "uploadTwoPdfs": "Carica due PDF per vedere le differenze.", + "noDifferences": "Nessuna differenza rilevata in questa pagina.", + "noMatchingChanges": "Nessuna modifica corrisponde al filtro corrente.", + "pageNotExist": "La pagina {{page}} non esiste in questo PDF.", + 
"noPairedPage": "Nessuna pagina associata per questo lato.", + "buildingModel": "Creazione del modello di abbinamento pagine...", + "indexingPdf": "Indicizzazione del PDF {{num}}, pagina {{page}} di {{total}}...", + "loadingComparison": "Caricamento confronto {{current}} di {{total}}...", + "runningOcr": "Esecuzione OCR sulla pagina {{page}}...", + "preparingExport": "Preparazione esportazione PDF...", + "renderingPage": "Rendering pagina {{current}} di {{total}}...", + "exportError": "Errore di esportazione", + "exportFailed": "Impossibile esportare il PDF di confronto.", + "loadingFile": "Caricamento di {{name}}...", + "invalidFile": "File non valido", + "invalidFileMsg": "Seleziona un file PDF valido.", + "loadError": "Impossibile caricare il PDF. Potrebbe essere danneggiato o protetto da password." }, "posterizePdf": { "name": "Posterizza PDF", diff --git a/public/locales/ko/tools.json b/public/locales/ko/tools.json index 7991f5f..daca6d5 100644 --- a/public/locales/ko/tools.json +++ b/public/locales/ko/tools.json @@ -292,7 +292,47 @@ }, "comparePdfs": { "name": "PDF 비교", - "subtitle": "두 PDF를 나란히 비교합니다." 
+ "subtitle": "두 PDF를 나란히 비교합니다.", + "firstPdf": "첫 번째 PDF", + "secondPdf": "두 번째 PDF", + "clickOrDrop": "클릭 또는 드롭", + "page": "페이지", + "overlay": "오버레이", + "sideBySide": "나란히 보기", + "flicker": "깜빡임", + "syncScroll": "스크롤 동기화", + "export": "내보내기", + "exportAsPdf": "PDF로 내보내기", + "splitView": "분할 보기", + "alternating": "번갈아 보기", + "leftDocument": "왼쪽 문서", + "rightDocument": "오른쪽 문서", + "original": "원본", + "modified": "수정본", + "searchChanges": "변경 사항 검색...", + "deleted": "삭제됨", + "added": "추가됨", + "prevPage": "이전 페이지", + "nextPage": "다음 페이지", + "prevChange": "이전 변경", + "nextChange": "다음 변경", + "uploadTwoPdfs": "차이점을 보려면 두 개의 PDF를 업로드하세요.", + "noDifferences": "이 페이지에서 차이점이 감지되지 않았습니다.", + "noMatchingChanges": "현재 필터와 일치하는 변경 사항이 없습니다.", + "pageNotExist": "페이지 {{page}}는 이 PDF에 존재하지 않습니다.", + "noPairedPage": "이쪽에 대응되는 페이지가 없습니다.", + "buildingModel": "페이지 페어링 모델을 만드는 중...", + "indexingPdf": "PDF {{num}}의 {{page}} / {{total}} 페이지를 인덱싱하는 중...", + "loadingComparison": "비교 {{current}} / {{total}} 불러오는 중...", + "runningOcr": "페이지 {{page}}에서 OCR 실행 중...", + "preparingExport": "PDF 내보내기 준비 중...", + "renderingPage": "페이지 {{current}} / {{total}} 렌더링 중...", + "exportError": "내보내기 오류", + "exportFailed": "비교 PDF를 내보낼 수 없습니다.", + "loadingFile": "{{name}} 불러오는 중...", + "invalidFile": "잘못된 파일", + "invalidFileMsg": "유효한 PDF 파일을 선택하세요.", + "loadError": "PDF를 불러올 수 없습니다. 손상되었거나 비밀번호로 보호되었을 수 있습니다." }, "posterizePdf": { "name": "PDF 포스터화", diff --git a/public/locales/nl/tools.json b/public/locales/nl/tools.json index c50eeca..624f658 100644 --- a/public/locales/nl/tools.json +++ b/public/locales/nl/tools.json @@ -292,7 +292,47 @@ }, "comparePdfs": { "name": "PDF's Vergelijken", - "subtitle": "Twee PDF's zij-aan-zij vergelijken." 
+ "subtitle": "Twee PDF's zij-aan-zij vergelijken.", + "firstPdf": "Eerste PDF", + "secondPdf": "Tweede PDF", + "clickOrDrop": "Klik of sleep neer", + "page": "Pagina", + "overlay": "Overlay", + "sideBySide": "Zij aan zij", + "flicker": "Flikkeren", + "syncScroll": "Scroll synchroniseren", + "export": "Exporteren", + "exportAsPdf": "Exporteren als PDF", + "splitView": "Gesplitste weergave", + "alternating": "Afwisselend", + "leftDocument": "Linkerdocument", + "rightDocument": "Rechterdocument", + "original": "Origineel", + "modified": "Gewijzigd", + "searchChanges": "Wijzigingen zoeken...", + "deleted": "Verwijderd", + "added": "Toegevoegd", + "prevPage": "Vorige pagina", + "nextPage": "Volgende pagina", + "prevChange": "Vorige wijziging", + "nextChange": "Volgende wijziging", + "uploadTwoPdfs": "Upload twee PDF's om de verschillen te zien.", + "noDifferences": "Geen verschillen gedetecteerd op deze pagina.", + "noMatchingChanges": "Geen wijzigingen komen overeen met het huidige filter.", + "pageNotExist": "Pagina {{page}} bestaat niet in deze PDF.", + "noPairedPage": "Geen gekoppelde pagina voor deze zijde.", + "buildingModel": "Model voor paginakoppeling wordt opgebouwd...", + "indexingPdf": "PDF {{num}}, pagina {{page}} van {{total}} wordt geïndexeerd...", + "loadingComparison": "Vergelijking {{current}} van {{total}} wordt geladen...", + "runningOcr": "OCR wordt uitgevoerd op pagina {{page}}...", + "preparingExport": "PDF-export wordt voorbereid...", + "renderingPage": "Pagina {{current}} van {{total}} wordt gerenderd...", + "exportError": "Exportfout", + "exportFailed": "Vergelijkings-PDF kon niet worden geëxporteerd.", + "loadingFile": "{{name}} wordt geladen...", + "invalidFile": "Ongeldig bestand", + "invalidFileMsg": "Selecteer een geldig PDF-bestand.", + "loadError": "Kon PDF niet laden. Het bestand kan beschadigd zijn of met een wachtwoord beveiligd zijn." 
}, "posterizePdf": { "name": "PDF-Poster", diff --git a/public/locales/pt/tools.json b/public/locales/pt/tools.json index 1a16138..24eaafe 100644 --- a/public/locales/pt/tools.json +++ b/public/locales/pt/tools.json @@ -288,7 +288,47 @@ }, "comparePdfs": { "name": "Comparar PDFs", - "subtitle": "Compare dois PDFs lado a lado." + "subtitle": "Compare dois PDFs lado a lado.", + "firstPdf": "Primeiro PDF", + "secondPdf": "Segundo PDF", + "clickOrDrop": "Clique ou solte", + "page": "Página", + "overlay": "Sobreposição", + "sideBySide": "Lado a lado", + "flicker": "Alternância rápida", + "syncScroll": "Sincronizar rolagem", + "export": "Exportar", + "exportAsPdf": "Exportar como PDF", + "splitView": "Visualização dividida", + "alternating": "Alternado", + "leftDocument": "Documento esquerdo", + "rightDocument": "Documento direito", + "original": "Original", + "modified": "Modificado", + "searchChanges": "Pesquisar alterações...", + "deleted": "Excluído", + "added": "Adicionado", + "prevPage": "Página anterior", + "nextPage": "Próxima página", + "prevChange": "Alteração anterior", + "nextChange": "Próxima alteração", + "uploadTwoPdfs": "Envie dois PDFs para ver as diferenças.", + "noDifferences": "Nenhuma diferença detectada nesta página.", + "noMatchingChanges": "Nenhuma alteração corresponde ao filtro atual.", + "pageNotExist": "A página {{page}} não existe neste PDF.", + "noPairedPage": "Não há página pareada para este lado.", + "buildingModel": "Criando modelo de pareamento de páginas...", + "indexingPdf": "Indexando PDF {{num}}, página {{page}} de {{total}}...", + "loadingComparison": "Carregando comparação {{current}} de {{total}}...", + "runningOcr": "Executando OCR na página {{page}}...", + "preparingExport": "Preparando exportação em PDF...", + "renderingPage": "Renderizando página {{current}} de {{total}}...", + "exportError": "Erro de exportação", + "exportFailed": "Não foi possível exportar o PDF de comparação.", + "loadingFile": "Carregando {{name}}...", + 
"invalidFile": "Arquivo inválido", + "invalidFileMsg": "Selecione um arquivo PDF válido.", + "loadError": "Não foi possível carregar o PDF. Ele pode estar corrompido ou protegido por senha." }, "posterizePdf": { "name": "Posterizar PDF", diff --git a/public/locales/sv/tools.json b/public/locales/sv/tools.json index d35aa9e..2f02976 100644 --- a/public/locales/sv/tools.json +++ b/public/locales/sv/tools.json @@ -292,7 +292,47 @@ }, "comparePdfs": { "name": "Jämför PDF:er", - "subtitle": "Jämför två PDF:er bredvid varandra." + "subtitle": "Jämför två PDF:er bredvid varandra.", + "firstPdf": "Första PDF", + "secondPdf": "Andra PDF", + "clickOrDrop": "Klicka eller släpp", + "page": "Sida", + "overlay": "Överlägg", + "sideBySide": "Sida vid sida", + "flicker": "Flimmer", + "syncScroll": "Synkronisera rullning", + "export": "Exportera", + "exportAsPdf": "Exportera som PDF", + "splitView": "Delad vy", + "alternating": "Växlande", + "leftDocument": "Vänster dokument", + "rightDocument": "Höger dokument", + "original": "Original", + "modified": "Ändrad", + "searchChanges": "Sök ändringar...", + "deleted": "Borttagen", + "added": "Tillagd", + "prevPage": "Föregående sida", + "nextPage": "Nästa sida", + "prevChange": "Föregående ändring", + "nextChange": "Nästa ändring", + "uploadTwoPdfs": "Ladda upp två PDF:er för att se skillnaderna.", + "noDifferences": "Inga skillnader upptäcktes på denna sida.", + "noMatchingChanges": "Inga ändringar matchar det aktuella filtret.", + "pageNotExist": "Sidan {{page}} finns inte i denna PDF.", + "noPairedPage": "Ingen matchad sida för denna sida.", + "buildingModel": "Bygger sidparningsmodell...", + "indexingPdf": "Indexerar PDF {{num}}, sida {{page}} av {{total}}...", + "loadingComparison": "Läser in jämförelse {{current}} av {{total}}...", + "runningOcr": "Kör OCR på sida {{page}}...", + "preparingExport": "Förbereder PDF-export...", + "renderingPage": "Renderar sida {{current}} av {{total}}...", + "exportError": "Exportfel", + 
"exportFailed": "Kunde inte exportera jämförelse-PDF.", + "loadingFile": "Läser in {{name}}...", + "invalidFile": "Ogiltig fil", + "invalidFileMsg": "Välj en giltig PDF-fil.", + "loadError": "Kunde inte läsa in PDF. Den kan vara skadad eller lösenordsskyddad." }, "posterizePdf": { "name": "Postera PDF", diff --git a/public/locales/tr/tools.json b/public/locales/tr/tools.json index 35a207a..a1f2eee 100644 --- a/public/locales/tr/tools.json +++ b/public/locales/tr/tools.json @@ -288,7 +288,47 @@ }, "comparePdfs": { "name": "PDF'leri Karşılaştır", - "subtitle": "İki PDF'yi yan yana karşılaştırın." + "subtitle": "İki PDF'yi yan yana karşılaştırın.", + "firstPdf": "İlk PDF", + "secondPdf": "İkinci PDF", + "clickOrDrop": "Tıklayın veya bırakın", + "page": "Sayfa", + "overlay": "Üst üste", + "sideBySide": "Yan yana", + "flicker": "Titreşim", + "syncScroll": "Kaydırmayı senkronize et", + "export": "Dışa aktar", + "exportAsPdf": "PDF olarak dışa aktar", + "splitView": "Bölünmüş görünüm", + "alternating": "Sırayla", + "leftDocument": "Sol belge", + "rightDocument": "Sağ belge", + "original": "Orijinal", + "modified": "Değiştirilmiş", + "searchChanges": "Değişiklikleri ara...", + "deleted": "Silindi", + "added": "Eklendi", + "prevPage": "Önceki sayfa", + "nextPage": "Sonraki sayfa", + "prevChange": "Önceki değişiklik", + "nextChange": "Sonraki değişiklik", + "uploadTwoPdfs": "Farkları görmek için iki PDF yükleyin.", + "noDifferences": "Bu sayfada fark algılanmadı.", + "noMatchingChanges": "Geçerli filtreyle eşleşen değişiklik yok.", + "pageNotExist": "{{page}} sayfası bu PDF'de yok.", + "noPairedPage": "Bu taraf için eşleştirilmiş sayfa yok.", + "buildingModel": "Sayfa eşleştirme modeli oluşturuluyor...", + "indexingPdf": "PDF {{num}} için {{total}} içinden {{page}}. sayfa dizinleniyor...", + "loadingComparison": "{{total}} içinden {{current}}. karşılaştırma yükleniyor...", + "runningOcr": "{{page}}. 
sayfada OCR çalıştırılıyor...", + "preparingExport": "PDF dışa aktarma hazırlanıyor...", + "renderingPage": "{{total}} içinden {{current}}. sayfa işleniyor...", + "exportError": "Dışa aktarma hatası", + "exportFailed": "Karşılaştırma PDF'i dışa aktarılamadı.", + "loadingFile": "{{name}} yükleniyor...", + "invalidFile": "Geçersiz dosya", + "invalidFileMsg": "Lütfen geçerli bir PDF dosyası seçin.", + "loadError": "PDF yüklenemedi. Bozuk olabilir veya parola korumalı olabilir." }, "posterizePdf": { "name": "PDF'yi Posta Boyutuna Böl", diff --git a/public/locales/vi/tools.json b/public/locales/vi/tools.json index fcdd960..088bbc5 100644 --- a/public/locales/vi/tools.json +++ b/public/locales/vi/tools.json @@ -292,7 +292,47 @@ }, "comparePdfs": { "name": "So sánh PDF", - "subtitle": "So sánh hai PDF cạnh nhau." + "subtitle": "So sánh hai PDF cạnh nhau.", + "firstPdf": "PDF thứ nhất", + "secondPdf": "PDF thứ hai", + "clickOrDrop": "Nhấp hoặc thả", + "page": "Trang", + "overlay": "Chồng lớp", + "sideBySide": "Cạnh nhau", + "flicker": "Nhấp nháy", + "syncScroll": "Đồng bộ cuộn", + "export": "Xuất", + "exportAsPdf": "Xuất dưới dạng PDF", + "splitView": "Chế độ chia đôi", + "alternating": "Luân phiên", + "leftDocument": "Tài liệu bên trái", + "rightDocument": "Tài liệu bên phải", + "original": "Bản gốc", + "modified": "Đã sửa đổi", + "searchChanges": "Tìm kiếm thay đổi...", + "deleted": "Đã xóa", + "added": "Đã thêm", + "prevPage": "Trang trước", + "nextPage": "Trang sau", + "prevChange": "Thay đổi trước", + "nextChange": "Thay đổi sau", + "uploadTwoPdfs": "Tải lên hai PDF để xem sự khác biệt.", + "noDifferences": "Không phát hiện khác biệt trên trang này.", + "noMatchingChanges": "Không có thay đổi nào khớp với bộ lọc hiện tại.", + "pageNotExist": "Trang {{page}} không tồn tại trong PDF này.", + "noPairedPage": "Không có trang ghép cho phía này.", + "buildingModel": "Đang xây dựng mô hình ghép trang...", + "indexingPdf": "Đang lập chỉ mục PDF {{num}}, trang {{page}} trên 
{{total}}...", + "loadingComparison": "Đang tải so sánh {{current}} trên {{total}}...", + "runningOcr": "Đang chạy OCR trên trang {{page}}...", + "preparingExport": "Đang chuẩn bị xuất PDF...", + "renderingPage": "Đang kết xuất trang {{current}} trên {{total}}...", + "exportError": "Lỗi xuất", + "exportFailed": "Không thể xuất PDF so sánh.", + "loadingFile": "Đang tải {{name}}...", + "invalidFile": "Tệp không hợp lệ", + "invalidFileMsg": "Vui lòng chọn tệp PDF hợp lệ.", + "loadError": "Không thể tải PDF. Có thể tệp bị hỏng hoặc được bảo vệ bằng mật khẩu." }, "posterizePdf": { "name": "Posterize PDF", diff --git a/public/locales/zh-TW/tools.json b/public/locales/zh-TW/tools.json index b129178..1cfb7e6 100644 --- a/public/locales/zh-TW/tools.json +++ b/public/locales/zh-TW/tools.json @@ -288,7 +288,47 @@ }, "comparePdfs": { "name": "比較 PDF", - "subtitle": "並排比較兩個 PDF。" + "subtitle": "並排比較兩個 PDF。", + "firstPdf": "第一個 PDF", + "secondPdf": "第二個 PDF", + "clickOrDrop": "點擊或拖放", + "page": "頁面", + "overlay": "疊加", + "sideBySide": "並排", + "flicker": "閃爍", + "syncScroll": "同步捲動", + "export": "匯出", + "exportAsPdf": "匯出為 PDF", + "splitView": "分割檢視", + "alternating": "交替", + "leftDocument": "左側文件", + "rightDocument": "右側文件", + "original": "原始", + "modified": "修改後", + "searchChanges": "搜尋變更...", + "deleted": "已刪除", + "added": "已新增", + "prevPage": "上一頁", + "nextPage": "下一頁", + "prevChange": "上一個變更", + "nextChange": "下一個變更", + "uploadTwoPdfs": "上傳兩個 PDF 以查看差異。", + "noDifferences": "此頁面未偵測到差異。", + "noMatchingChanges": "沒有符合目前篩選條件的變更。", + "pageNotExist": "此 PDF 中不存在第 {{page}} 頁。", + "noPairedPage": "此側沒有配對頁面。", + "buildingModel": "正在建立頁面配對模型...", + "indexingPdf": "正在索引 PDF {{num}},第 {{page}} / {{total}} 頁...", + "loadingComparison": "正在載入比較 {{current}} / {{total}}...", + "runningOcr": "正在對第 {{page}} 頁執行 OCR...", + "preparingExport": "正在準備 PDF 匯出...", + "renderingPage": "正在轉譯第 {{current}} / {{total}} 頁...", + "exportError": "匯出錯誤", + "exportFailed": "無法匯出比較 PDF。", + "loadingFile": 
"正在載入 {{name}}...", + "invalidFile": "無效檔案", + "invalidFileMsg": "請選擇有效的 PDF 檔案。", + "loadError": "無法載入 PDF。檔案可能已損毀或受密碼保護。" }, "posterizePdf": { "name": "海報化 PDF", diff --git a/public/locales/zh/tools.json b/public/locales/zh/tools.json index d867a35..adefe6a 100644 --- a/public/locales/zh/tools.json +++ b/public/locales/zh/tools.json @@ -292,7 +292,47 @@ }, "comparePdfs": { "name": "比较 PDF", - "subtitle": "并排比较两个 PDF。" + "subtitle": "并排比较两个 PDF。", + "firstPdf": "第一个 PDF", + "secondPdf": "第二个 PDF", + "clickOrDrop": "点击或拖放", + "page": "页面", + "overlay": "叠加", + "sideBySide": "并排", + "flicker": "闪烁", + "syncScroll": "同步滚动", + "export": "导出", + "exportAsPdf": "导出为 PDF", + "splitView": "分屏视图", + "alternating": "交替", + "leftDocument": "左侧文档", + "rightDocument": "右侧文档", + "original": "原始", + "modified": "修改后", + "searchChanges": "搜索更改...", + "deleted": "已删除", + "added": "已添加", + "prevPage": "上一页", + "nextPage": "下一页", + "prevChange": "上一处更改", + "nextChange": "下一处更改", + "uploadTwoPdfs": "上传两个 PDF 以查看差异。", + "noDifferences": "此页面未检测到差异。", + "noMatchingChanges": "没有与当前筛选条件匹配的更改。", + "pageNotExist": "此 PDF 中不存在第 {{page}} 页。", + "noPairedPage": "此侧没有配对页面。", + "buildingModel": "正在构建页面配对模型...", + "indexingPdf": "正在索引 PDF {{num}},第 {{page}} / {{total}} 页...", + "loadingComparison": "正在加载比较 {{current}} / {{total}}...", + "runningOcr": "正在对第 {{page}} 页运行 OCR...", + "preparingExport": "正在准备 PDF 导出...", + "renderingPage": "正在渲染第 {{current}} / {{total}} 页...", + "exportError": "导出错误", + "exportFailed": "无法导出比较 PDF。", + "loadingFile": "正在加载 {{name}}...", + "invalidFile": "无效文件", + "invalidFileMsg": "请选择有效的 PDF 文件。", + "loadError": "无法加载 PDF。文件可能已损坏或受密码保护。" }, "posterizePdf": { "name": "海报化 PDF", diff --git a/src/js/compare/config.ts b/src/js/compare/config.ts index e0ccc59..42f6653 100644 --- a/src/js/compare/config.ts +++ b/src/js/compare/config.ts @@ -2,6 +2,8 @@ export const COMPARE_COLORS = { added: { r: 34, g: 197, b: 94 }, removed: { r: 239, g: 68, b: 68 }, modified: { r: 245, 
g: 158, b: 11 }, + moved: { r: 168, g: 85, b: 247 }, + 'style-changed': { r: 59, g: 130, b: 246 }, } as const; export const HIGHLIGHT_OPACITY = 0.28; diff --git a/src/js/compare/engine/compare-content.ts b/src/js/compare/engine/compare-content.ts new file mode 100644 index 0000000..2bd8155 --- /dev/null +++ b/src/js/compare/engine/compare-content.ts @@ -0,0 +1,213 @@ +import type { + CompareAnnotation, + CompareContentCategory, + CompareImageRef, + ComparePageModel, + CompareRectangle, + CompareTextChange, +} from '../types.ts'; + /* Fraction of the page height, counted from each vertical edge, that isHeaderFooterZone treats as the header or footer band. */ +const HEADER_FOOTER_ZONE = 0.12; + /** Picks the content category of a change: 'formatting' for style-changed entries, 'header-footer' when every rect of the change lies in a header/footer band, otherwise 'text'. Uses beforeRects when it is non-empty and falls back to afterRects. */ +export function classifyChangeCategory( + change: CompareTextChange, + pageHeight: number +): CompareContentCategory { + if (change.type === 'style-changed') return 'formatting'; + + const rects = + change.beforeRects.length > 0 ? change.beforeRects : change.afterRects; + if (rects.length > 0 && isHeaderFooterZone(rects, pageHeight)) { + return 'header-footer'; + } + + return 'text'; +} + /** True when, for every rect, r.y is below pageHeight * HEADER_FOOTER_ZONE or r.y + r.height is above pageHeight * (1 - HEADER_FOOTER_ZONE). NOTE(review): presumably y is measured downward from the top of the page, confirm against the viewport convention. */ +function isHeaderFooterZone( + rects: CompareRectangle[], + pageHeight: number +): boolean { + const headerThreshold = pageHeight * HEADER_FOOTER_ZONE; + const footerThreshold = pageHeight * (1 - HEADER_FOOTER_ZONE); + return rects.every( + (r) => r.y < headerThreshold || r.y + r.height > footerThreshold + ); +} + /** Compares two annotation lists by annotationKey. Returns a 'removed' change for every key present only in `before` and an 'added' change for every key present only in `after`. Ids carry a counter that starts at baseId. Annotations sharing a key collapse into one Map entry, so only the last of them is compared. */ +export function diffAnnotations( + before: CompareAnnotation[], + after: CompareAnnotation[], + baseId: number +): CompareTextChange[] { + const changes: CompareTextChange[] = []; + const beforeMap = new Map(before.map((a) => [annotationKey(a), a])); + const afterMap = new Map(after.map((a) => [annotationKey(a), a])); + + let idx = baseId; + for (const [key, ann] of beforeMap) { + if (!afterMap.has(key)) { + changes.push({ + id: `annotation-removed-${idx++}`, + type: 'removed', + category: 'annotation', + description: + `Removed ${ann.subtype} annotation: "${ann.contents || ann.title || ''}"`.trim(), + beforeText: ann.contents || ann.title || '', + afterText: '', + beforeRects: [ann.rect], + afterRects:
[], + }); + } + } + /* Second pass: keys that exist only in the after list are reported as additions. */ + for (const [key, ann] of afterMap) { + if (!beforeMap.has(key)) { + changes.push({ + id: `annotation-added-${idx++}`, + type: 'added', + category: 'annotation', + description: + `Added ${ann.subtype} annotation: "${ann.contents || ann.title || ''}"`.trim(), + beforeText: '', + afterText: ann.contents || ann.title || '', + beforeRects: [], + afterRects: [ann.rect], + }); + } + } + + return changes; +} + /* Identity key for an annotation: subtype, contents and the rounded x,y of its rect. The title is not part of the key. */ +function annotationKey(ann: CompareAnnotation): string { + return `${ann.subtype}|${ann.contents}|${Math.round(ann.rect.x)},${Math.round(ann.rect.y)}`; +} + /** Pairs each image in `before` with the first not-yet-matched image in `after` whose rect passes imagesOverlap. A pair whose width or height differ yields a 'modified' change, an unpaired before-image yields 'removed', and after-images left unmatched yield 'added'. Ids carry a counter that starts at baseId. */ +export function diffImages( + before: CompareImageRef[], + after: CompareImageRef[], + baseId: number +): CompareTextChange[] { + const changes: CompareTextChange[] = []; + const matched = new Set(); + let idx = baseId; + + for (const bImg of before) { + const match = after.find( + (aImg) => !matched.has(aImg.id) && imagesOverlap(bImg.rect, aImg.rect) + ); + if (match) { + matched.add(match.id); + if (bImg.width !== match.width || bImg.height !== match.height) { + changes.push({ + id: `image-modified-${idx++}`, + type: 'modified', + category: 'image', + description: `Image resized from ${bImg.width}×${bImg.height} to ${match.width}×${match.height}`, + beforeText: `${bImg.width}×${bImg.height}`, + afterText: `${match.width}×${match.height}`, + beforeRects: [bImg.rect], + afterRects: [match.rect], + }); + } + } else { + changes.push({ + id: `image-removed-${idx++}`, + type: 'removed', + category: 'image', + description: `Removed image (${bImg.width}×${bImg.height})`, + beforeText: '', + afterText: '', + beforeRects: [bImg.rect], + afterRects: [], + }); + } + } + /* Whatever was never claimed as a match exists only in the after list. */ + for (const aImg of after) { + if (!matched.has(aImg.id)) { + changes.push({ + id: `image-added-${idx++}`, + type: 'added', + category: 'image', + description: `Added image (${aImg.width}×${aImg.height})`, + beforeText: '', + afterText: '', + beforeRects: [], + afterRects: [aImg.rect], + }); + } + } + + return changes; +} +
+function imagesOverlap(a: CompareRectangle, b: CompareRectangle): boolean { /* True when the intersection area is more than 30% of the smaller rect's area; a rect with zero area never overlaps. */ + const overlapX = Math.max( + 0, + Math.min(a.x + a.width, b.x + b.width) - Math.max(a.x, b.x) + ); + const overlapY = Math.max( + 0, + Math.min(a.y + a.height, b.y + b.height) - Math.max(a.y, b.y) + ); + const overlapArea = overlapX * overlapY; + const aArea = a.width * a.height; + const bArea = b.width * b.height; + const smallerArea = Math.min(aArea, bArea); + return smallerArea > 0 && overlapArea / smallerArea > 0.3; +} + /** Emits one full-page 'background' change when visualMismatchRatio is at least 0.01 and exceeds, by more than 0.05, the share of the left page area covered by textChangeRects. Otherwise returns an empty list. Rect areas are summed as given, so overlapping rects are counted more than once. */ +export function detectBackgroundChanges( + leftModel: ComparePageModel, + rightModel: ComparePageModel, + visualMismatchRatio: number, + textChangeRects: CompareRectangle[], + baseId: number +): CompareTextChange[] { + if (visualMismatchRatio < 0.01) return []; + + const textCoverage = textChangeRects.reduce( + (sum, r) => sum + r.width * r.height, + 0 + ); + const pageArea = leftModel.width * leftModel.height; + const textRatio = pageArea > 0 ? textCoverage / pageArea : 0; + + if (visualMismatchRatio > textRatio + 0.05) { + return [ + { + id: `background-changed-${baseId}`, + type: 'modified', + category: 'background', + description: 'Page background or layout changed', + beforeText: '', + afterText: '', + beforeRects: [ + { x: 0, y: 0, width: leftModel.width, height: leftModel.height }, + ], + afterRects: [ + { x: 0, y: 0, width: rightModel.width, height: rightModel.height }, + ], + }, + ]; + } + + return []; +} + /** Counts the given changes per content category, starting every category at zero. */ +export function buildCategorySummary(changes: CompareTextChange[]) { + const summary = { + text: 0, + image: 0, + 'header-footer': 0, + annotation: 0, + formatting: 0, + background: 0, + }; + for (const c of changes) { + summary[c.category] += 1; + } + return summary; +} diff --git a/src/js/compare/engine/compare-page-models.ts b/src/js/compare/engine/compare-page-models.ts index 21d7b63..7564ec2 100644 --- a/src/js/compare/engine/compare-page-models.ts +++ b/src/js/compare/engine/compare-page-models.ts @@ -1,10 +1,49 @@ -import type { ComparePageModel,
ComparePageResult } from '../types.ts'; +import type { + ComparePageModel, + ComparePageResult, + CompareCategorySummary, +} from '../types.ts'; import { diffTextRuns } from './diff-text-runs.ts'; +import { diffTextRunsAsync } from '../worker-api.ts'; +import { + classifyChangeCategory, + diffAnnotations, + diffImages, + buildCategorySummary, +} from './compare-content.ts'; + /* All-zero category counts. Results spread it into a fresh object instead of sharing this one. */ +const EMPTY_CATEGORY_SUMMARY: CompareCategorySummary = { + text: 0, + image: 0, + 'header-footer': 0, + annotation: 0, + formatting: 0, + background: 0, +}; export function comparePageModels( leftPage: ComparePageModel | null, rightPage: ComparePageModel | null ): ComparePageResult { + return comparePageModelsCore(leftPage, rightPage, false) as ComparePageResult; +} + /** Variant of comparePageModels that runs the text diff through diffTextRunsAsync. Declared async so the result is always a real Promise: comparePageModelsCore returns a plain object for the single-page and no-page cases even when useWorker is true, and a bare type cast would hand that object to callers that chain .then(). */ +export async function comparePageModelsAsync( + leftPage: ComparePageModel | null, + rightPage: ComparePageModel | null +): Promise { + return comparePageModelsCore( + leftPage, + rightPage, + true + ); +} + /** Shared implementation behind both entry points. Only the branch that diffs two present pages honours useWorker; the early-return branches produce their result synchronously. */ +function comparePageModelsCore( + leftPage: ComparePageModel | null, + rightPage: ComparePageModel | null, + useWorker: boolean +): ComparePageResult | Promise { if (leftPage && !rightPage) { return { status: 'left-only', @@ -14,6 +53,7 @@ export function comparePageModels( { id: 'page-removed', type: 'page-removed', + category: 'text', description: `Page ${leftPage.pageNumber} exists only in the first PDF.`, beforeText: leftPage.plainText.slice(0, 200), afterText: '', @@ -21,7 +61,8 @@ export function comparePageModels( afterRects: [], }, ], - summary: { added: 0, removed: 1, modified: 0 }, + summary: { added: 0, removed: 1, modified: 0, moved: 0, styleChanged: 0 }, + categorySummary: { ...EMPTY_CATEGORY_SUMMARY, text: 1 }, visualDiff: null, usedOcr: leftPage.source === 'ocr', }; @@ -36,6 +77,7 @@ export function comparePageModels( { id: 'page-added', type: 'page-added', + category: 'text', description: `Page ${rightPage.pageNumber} exists only in the second PDF.`, beforeText: '', afterText:
rightPage.plainText.slice(0, 200), @@ -43,7 +85,8 @@ export function comparePageModels( afterRects: [], }, ], - summary: { added: 1, removed: 0, modified: 0 }, + summary: { added: 1, removed: 0, modified: 0, moved: 0, styleChanged: 0 }, + categorySummary: { ...EMPTY_CATEGORY_SUMMARY, text: 1 }, visualDiff: null, usedOcr: rightPage.source === 'ocr', }; @@ -55,24 +98,57 @@ export function comparePageModels( leftPageNumber: null, rightPageNumber: null, changes: [], - summary: { added: 0, removed: 0, modified: 0 }, + summary: { added: 0, removed: 0, modified: 0, moved: 0, styleChanged: 0 }, + categorySummary: { ...EMPTY_CATEGORY_SUMMARY }, visualDiff: null, usedOcr: false, }; } - const { changes, summary } = diffTextRuns( - leftPage.textItems, - rightPage.textItems - ); + function buildResult(diff: { + changes: ComparePageResult['changes']; + summary: ComparePageResult['summary']; + }): ComparePageResult { /* Turns a text diff into the page result: re-categorises 'text' changes by page position, then appends annotation and image changes. Both pages are non-null on this path, hence the non-null assertions. */ + const allChanges = [...diff.changes]; + const pageHeight = Math.max(leftPage!.height, rightPage!.height); - return { - status: changes.length > 0 ? 'changed' : 'match', - leftPageNumber: leftPage.pageNumber, - rightPageNumber: rightPage.pageNumber, - changes, - summary, - visualDiff: null, - usedOcr: leftPage.source === 'ocr' || rightPage.source === 'ocr', - }; + for (const c of allChanges) { /* the array copy is shallow, so this rewrites category on the change objects of diff.changes too */ + if (c.category === 'text') { + c.category = classifyChangeCategory(c, pageHeight); + } + } + /* allChanges.length seeds the id counter of the annotation changes, and again of the image changes below. */ + const annotChanges = diffAnnotations( + leftPage!.annotations ?? [], + rightPage!.annotations ?? [], + allChanges.length + ); + allChanges.push(...annotChanges); + + const imageChanges = diffImages( + leftPage!.images ?? [], + rightPage!.images ?? [], + allChanges.length + ); + allChanges.push(...imageChanges); + + return { + status: allChanges.length > 0 ?
'changed' : 'match', + leftPageNumber: leftPage!.pageNumber, + rightPageNumber: rightPage!.pageNumber, + changes: allChanges, + summary: diff.summary, /* NOTE(review): these counts come from the text diff only; the annotation and image changes appended to allChanges are not reflected here. Confirm that is intended. */ + categorySummary: buildCategorySummary(allChanges), + visualDiff: null, + usedOcr: leftPage!.source === 'ocr' || rightPage!.source === 'ocr', + }; + } + + if (useWorker) { + return diffTextRunsAsync(leftPage.textItems, rightPage.textItems).then( + buildResult + ); + } + + return buildResult(diffTextRuns(leftPage.textItems, rightPage.textItems)); } diff --git a/src/js/compare/engine/compare.worker.ts b/src/js/compare/engine/compare.worker.ts new file mode 100644 index 0000000..21bfc56 --- /dev/null +++ b/src/js/compare/engine/compare.worker.ts @@ -0,0 +1,77 @@ +import { diffTextRuns } from './diff-text-runs.ts'; +import { pairPages } from './pair-pages.ts'; +import type { + CompareTextItem, + ComparePageSignature, + ComparePagePair, + ComparePageResult, + CompareChangeSummary, + CompareTextChange, +} from '../types.ts'; + /* Request: run diffTextRuns on two lists of text items. */ +interface DiffMessage { + type: 'diff'; + id: number; + beforeItems: CompareTextItem[]; + afterItems: CompareTextItem[]; +} + /* Request: run pairPages on two lists of page signatures. */ +interface PairMessage { + type: 'pair'; + id: number; + leftPages: ComparePageSignature[]; + rightPages: ComparePageSignature[]; +} + +type WorkerMessage = DiffMessage | PairMessage; + /* Replies echo the id of the request they answer. */ +interface DiffResult { + type: 'diff'; + id: number; + changes: CompareTextChange[]; + summary: CompareChangeSummary; +} + +interface PairResult { + type: 'pair'; + id: number; + pairs: ComparePagePair[]; +} + +interface ErrorResult { + type: 'error'; + id: number; + message: string; +} + +type WorkerResult = DiffResult | PairResult | ErrorResult; + /* Handles 'diff' and 'pair' requests. Anything thrown while handling a request is reported back as an 'error' result carrying the same id. */ +self.onmessage = function (e: MessageEvent) { + const msg = e.data; + try { + if (msg.type === 'diff') { + const { changes, summary } = diffTextRuns( + msg.beforeItems, + msg.afterItems + ); + const result: DiffResult = { + type: 'diff', + id: msg.id, + changes, + summary, + }; + self.postMessage(result); + } else if (msg.type === 'pair') { +
const pairs = pairPages(msg.leftPages, msg.rightPages); + const result: PairResult = { type: 'pair', id: msg.id, pairs }; + self.postMessage(result); + } /* a message with any other type falls through here and gets no reply */ + } catch (err) { + const result: ErrorResult = { + type: 'error', + id: msg.id, + message: err instanceof Error ? err.message : String(err), /* non-Error throwables are stringified */ + }; + self.postMessage(result); + } +}; diff --git a/src/js/compare/engine/diff-text-runs.ts b/src/js/compare/engine/diff-text-runs.ts index d5fa213..ac70ccc 100644 --- a/src/js/compare/engine/diff-text-runs.ts +++ b/src/js/compare/engine/diff-text-runs.ts @@ -8,13 +8,19 @@ import type { CompareTextItem, CompareWordToken, } from '../types.ts'; -import { calculateBoundingRect } from './text-normalization.ts'; +import { + calculateBoundingRect, + containsCJK, + segmentCJKText, +} from './text-normalization.ts'; import { COMPARE_GEOMETRY } from '../config.ts'; interface WordToken { word: string; compareWord: string; rect: CompareRectangle; + fontName?: string; /* optional style metadata, copied from the CompareWordToken when the line provides word tokens */ + fontSize?: number; } function getCharMap(line: CompareTextItem): CharPosition[] { @@ -30,11 +36,15 @@ function getCharMap(line: CompareTextItem): CharPosition[] { function splitLineIntoWords(line: CompareTextItem): WordToken[] { if (line.wordTokens && line.wordTokens.length > 0) { - return line.wordTokens.map((token: CompareWordToken) => ({ + const baseTokens = line.wordTokens.map((token: CompareWordToken) => ({ word: token.word, compareWord: token.compareWord, rect: token.rect, + fontName: token.fontName, + fontSize: token.fontSize, })); + if (!containsCJK(line.normalizedText)) return baseTokens; /* only lines containing CJK text pay for the extra segmentation pass */ + return baseTokens.flatMap(splitCJKToken); } const words = line.normalizedText.split(/\s+/).filter(Boolean); @@ -43,7 +53,7 @@ function splitLineIntoWords(line: CompareTextItem): WordToken[] { const charMap = getCharMap(line); let offset = 0; - return words.map((word) => { + const baseTokens = words.map((word) => { const startIndex = line.normalizedText.indexOf(word, offset); const endIndex = startIndex + word.length - 1;
offset = startIndex + word.length; @@ -75,6 +85,31 @@ function splitLineIntoWords(line: CompareTextItem): WordToken[] { rect: { x, y: line.rect.y, width: w, height: line.rect.height }, }; }); + + if (!containsCJK(line.normalizedText)) return baseTokens; + return baseTokens.flatMap(splitCJKToken); +} + /** Splits a token containing CJK text into one token per segment returned by segmentCJKText, dividing the token's rect horizontally in proportion to segment length. Tokens without CJK text, or that yield at most one segment, are returned unchanged. Each piece is spread from the parent token so fontName and fontSize survive the split; without them hasStyleDifference could never report a style change on CJK text. */ +function splitCJKToken(token: WordToken): WordToken[] { + if (!containsCJK(token.word)) return [token]; + + const segments = segmentCJKText(token.word); + if (segments.length <= 1) return [token]; + /* NOTE(review): segmentCJKText drops segments that are not word-like, so charOffset can fall behind the true character position when the token contains punctuation. Confirm whether the resulting rect drift matters. */ + const charWidth = token.rect.width / Math.max(token.word.length, 1); + let charOffset = 0; + + return segments.map((seg) => { + const x = token.rect.x + charOffset * charWidth; + const width = seg.length * charWidth; + charOffset += seg.length; + return { + ...token, + word: seg, + compareWord: seg.toLowerCase(), + rect: { x, y: token.rect.y, width, height: token.rect.height }, + }; + }); } function groupAdjacentRects(rects: CompareRectangle[]): CompareRectangle[] { @@ -138,6 +173,7 @@ function createWordChange( changes.push({ id, type, + category: 'text', description: `Replaced "${beforeText}" with "${afterText}"`, beforeText, afterText, @@ -148,6 +184,7 @@ function createWordChange( changes.push({ id, type, + category: 'text', description: `Removed "${beforeText}"`, beforeText, afterText: '', @@ -158,6 +195,7 @@ function createWordChange( changes.push({ id, type, + category: 'text', description: `Added "${afterText}"`, beforeText: '', afterText, @@ -173,9 +211,11 @@ function toSummary(changes: CompareTextChange[]): CompareChangeSummary { if (change.type === 'added') summary.added += 1; if (change.type === 'removed') summary.removed += 1; if (change.type === 'modified') summary.modified += 1; + if (change.type === 'moved') summary.moved += 1; + if (change.type === 'style-changed') summary.styleChanged += 1; return summary; }, - { added: 0, removed: 0, modified: 0 } + { added: 0, removed: 0, modified: 0, moved: 0, styleChanged: 0 } ); } @@ -233,5
+273,202 @@ export function diffTextRuns( afterIndex += count; } + detectStyleChanges(changes, beforeWords, afterWords, rawChanges); + detectMovedText(changes); + return { changes, summary: toSummary(changes) }; } + /* Replaces a leading g_d<digits>_ prefix with g_d_ so names that differ only in that number compare equal. NOTE(review): presumably this is the per-document prefix of pdf.js internal font ids, confirm. */ +function normalizeFontName(name: string): string { + return name.replace(/^g_d\d+_/, 'g_d_'); +} + /** True when both tokens carry a font name and the normalized names differ, or both carry a font size and the sizes differ by more than 0.5. A value missing on either side never counts as a difference. */ +function hasStyleDifference(before: WordToken, after: WordToken): boolean { + if ( + before.fontName && + after.fontName && + normalizeFontName(before.fontName) !== normalizeFontName(after.fontName) + ) + return true; + if ( + before.fontSize && + after.fontSize && + Math.abs(before.fontSize - after.fontSize) > 0.5 + ) + return true; + return false; +} + /** Walks the unchanged runs of rawChanges, gathers consecutive word pairs whose style differs into fragments, groups the fragments by their font and size transition, and appends one 'style-changed' entry per group to `changes`. */ +function detectStyleChanges( + changes: CompareTextChange[], + beforeWords: WordToken[], + afterWords: WordToken[], + rawChanges: ReturnType> +) { + interface StyleFragment { + bFont: string; + aFont: string; + bSize: number | undefined; + aSize: number | undefined; + text: string; + beforeRects: CompareRectangle[]; + afterRects: CompareRectangle[]; + } + + const fragments: StyleFragment[] = []; + let beforeIdx = 0; + let afterIdx = 0; + + for (const change of rawChanges) { + const count = change.value.length; + if (change.removed) { + beforeIdx += count; + continue; + } + if (change.added) { + afterIdx += count; + continue; + } + /* Unchanged run: both sides advance in lockstep, so index k addresses the same word on each side. */ + let styleRunStart = -1; + for (let k = 0; k < count; k++) { + const bw = beforeWords[beforeIdx + k]; + const aw = afterWords[afterIdx + k]; + const isDiff = hasStyleDifference(bw, aw); + + if (isDiff && styleRunStart < 0) { + styleRunStart = k; + } + if ((!isDiff || k === count - 1) && styleRunStart >= 0) { + const end = isDiff ? k + 1 : k; + const bTokens = beforeWords.slice( + beforeIdx + styleRunStart, + beforeIdx + end + ); + const aTokens = afterWords.slice( + afterIdx + styleRunStart, + afterIdx + end + ); + fragments.push({ + bFont: bTokens[0].fontName ?? 'unknown', + aFont: aTokens[0].fontName ??
'unknown', + bSize: bTokens[0].fontSize, + aSize: aTokens[0].fontSize, + text: bTokens.map((w) => w.word).join(' '), + beforeRects: groupAdjacentRects(bTokens.map((w) => w.rect)), + afterRects: groupAdjacentRects(aTokens.map((w) => w.rect)), + }); + styleRunStart = -1; + } + } + + beforeIdx += count; + afterIdx += count; + } + + const groups = new Map(); + for (const frag of fragments) { + const key = `${frag.bFont}→${frag.aFont}|${frag.bSize ?? ''}→${frag.aSize ?? ''}`; + const arr = groups.get(key); + if (arr) arr.push(frag); + else groups.set(key, [frag]); + } + + for (const groupFrags of groups.values()) { + const bFont = groupFrags[0].bFont; + const aFont = groupFrags[0].aFont; + const bSize = groupFrags[0].bSize; + const aSize = groupFrags[0].aSize; + const allText = groupFrags.map((f) => f.text).join(' … '); + const allBeforeRects = groupFrags.flatMap((f) => f.beforeRects); + const allAfterRects = groupFrags.flatMap((f) => f.afterRects); + /* Compare normalized names, as hasStyleDifference does, so a size-only change is not also reported as a font change when the raw names differ only in their g_d<digits>_ prefix. */ + let desc = `Style changed (${groupFrags.length} regions)`; + const details: string[] = []; + if (normalizeFontName(bFont) !== normalizeFontName(aFont)) details.push(`Font: ${bFont} → ${aFont}`); + if (bSize && aSize && Math.abs(bSize - aSize) > 0.5) + details.push(`Font size: ${bSize} → ${aSize}`); + if (details.length) desc += '\n' + details.map((d) => `• ${d}`).join('\n'); + + changes.push({ + id: `style-changed-${changes.length}`, + type: 'style-changed', + category: 'formatting', + description: desc, + beforeText: allText, + afterText: allText, + beforeRects: allBeforeRects, + afterRects: allAfterRects, + }); + } +} + /* Move detection skips removed text shorter than MOVE_MIN_WORDS words and needs a similarity strictly above MOVE_SIMILARITY_THRESHOLD. */ +const MOVE_MIN_WORDS = 3; +const MOVE_SIMILARITY_THRESHOLD = 0.8; + /* Lower-cases, collapses whitespace runs to a single space and trims. */ +function normalizeForMove(text: string): string { + return text.toLowerCase().replace(/\s+/g, ' ').trim(); +} + /** Similarity of two normalized strings: the number of words of `a` that also occur in `b`, divided by the longer word count. Returns 1 for identical strings and 0 when either string is empty. */ +function moveSimilarity(a: string, b: string): number { + if (a === b) return 1; + if (!a || !b) return 0; + const aWords = a.split(' '); + const bWords = b.split(' '); + const bSet = new Set(bWords); + let matches = 0; + for (const w of aWords) { + if
(bSet.has(w)) matches++; + } + return matches / Math.max(aWords.length, bWords.length); +} + /** Rewrites `changes` in place: each 'removed' entry of at least MOVE_MIN_WORDS words is paired with the best-scoring unmatched 'added' entry whose moveSimilarity is strictly above MOVE_SIMILARITY_THRESHOLD. Every pair is replaced by a single 'moved' entry that keeps the removed rects as beforeRects and the added rects as afterRects. */ +function detectMovedText(changes: CompareTextChange[]) { + const removed = changes.filter((c) => c.type === 'removed'); + const added = changes.filter((c) => c.type === 'added'); + if (removed.length === 0 || added.length === 0) return; + + const matchedRemoved = new Set(); + const matchedAdded = new Set(); + + for (const rem of removed) { + const remNorm = normalizeForMove(rem.beforeText); + const remWordCount = remNorm.split(' ').length; + if (remWordCount < MOVE_MIN_WORDS) continue; + + let bestMatch: CompareTextChange | null = null; + let bestScore = MOVE_SIMILARITY_THRESHOLD; + /* Starting bestScore at the threshold means only candidates scoring above it can win. */ + for (const add of added) { + if (matchedAdded.has(add.id)) continue; + const addNorm = normalizeForMove(add.afterText); + const score = moveSimilarity(remNorm, addNorm); + if (score > bestScore) { + bestScore = score; + bestMatch = add; + } + } + + if (bestMatch) { + matchedRemoved.add(rem.id); + matchedAdded.add(bestMatch.id); + + changes.push({ + id: `moved-${changes.length}`, + type: 'moved', + category: 'text', + description: `Moved "${rem.beforeText.slice(0, 80)}"`, + beforeText: rem.beforeText, + afterText: bestMatch.afterText, + beforeRects: rem.beforeRects, + afterRects: bestMatch.afterRects, + }); + } + } + /* Drop the entries that were folded into a 'moved' change; walking backwards keeps the remaining indices valid while splicing. */ + for (let i = changes.length - 1; i >= 0; i--) { + if (matchedRemoved.has(changes[i].id) || matchedAdded.has(changes[i].id)) { + changes.splice(i, 1); + } + } +} diff --git a/src/js/compare/engine/extract-page-model.ts b/src/js/compare/engine/extract-page-model.ts index d1938fd..434e463 100644 --- a/src/js/compare/engine/extract-page-model.ts +++ b/src/js/compare/engine/extract-page-model.ts @@ -1,6 +1,8 @@ import * as pdfjsLib from 'pdfjs-dist'; import type { + CompareAnnotation, + CompareImageRef, ComparePageModel, CompareTextItem, CharPosition, @@ -9,6 +11,8 @@ import type { import { joinCompareTextItems, normalizeCompareText, + containsCJK, + segmentCJKText, } from
'./text-normalization.ts'; type PageTextItem = { @@ -69,11 +73,14 @@ function measureTextWidth(fontSpec: string, text: string): number { return width; } +type FontNameMap = Map; /* maps the fontName found on a text item to the name read from the page's font object */ + function buildItemWordTokens( viewport: pdfjsLib.PageViewport, item: PageTextItem, fallbackRect: CompareTextItem['rect'], - styles: TextStyles + styles: TextStyles, + fontNameMap: FontNameMap ): CompareWordToken[] { const rawText = item.str || ''; if (!rawText.trim()) { @@ -216,19 +223,47 @@ function buildItemWordTokens( (previousToken ? shouldJoinTokenWithPrevious(previousToken.word, normalizedWord) : false), + fontName: fontNameMap.get(item.fontName) ?? item.fontName ?? undefined, /* prefer the mapped name, fall back to the item's own fontName */ + fontSize: fontScale > 0 ? Math.round(fontScale * 100) / 100 : undefined, /* rounded to two decimals; left undefined when the scale is not positive */ }); previousEnd = endIndex; } - return tokens; + if (!containsCJK(rawText)) return tokens; + return tokens.flatMap(splitCJKWordToken); +} + /** Splits a word token containing CJK text into one token per segment returned by segmentCJKText, dividing the rect horizontally in proportion to segment length. Font name and size are copied to every piece, and every piece after the first is marked as joining its predecessor. NOTE(review): segmentCJKText drops segments that are not word-like, so charOffset can fall behind the true character position when the token contains punctuation. */ +function splitCJKWordToken(token: CompareWordToken): CompareWordToken[] { + if (!containsCJK(token.word)) return [token]; + const segments = segmentCJKText(token.word); + if (segments.length <= 1) return [token]; + + const totalLen = token.word.length; + const charWidth = token.rect.width / Math.max(totalLen, 1); + let charOffset = 0; + + return segments.map((seg, i) => { + const x = token.rect.x + charOffset * charWidth; + const width = seg.length * charWidth; + charOffset += seg.length; + return { + word: seg, + compareWord: seg.toLowerCase(), + rect: { x, y: token.rect.y, width, height: token.rect.height }, + joinsWithPrevious: i > 0 ?
true : token.joinsWithPrevious, + fontName: token.fontName, + fontSize: token.fontSize, + }; + }); } function toRect( viewport: pdfjsLib.PageViewport, item: PageTextItem, index: number, - styles: TextStyles + styles: TextStyles, + fontNameMap: FontNameMap ) { const normalizedText = normalizeCompareText(item.str); @@ -256,7 +291,7 @@ function toRect( text: item.str, normalizedText, rect, - wordTokens: buildItemWordTokens(viewport, item, rect, styles), + wordTokens: buildItemWordTokens(viewport, item, rect, styles, fontNameMap), } satisfies CompareTextItem; } @@ -387,6 +422,8 @@ function mergeWordTokenRects( width: maxX - minX, height: maxY - minY, }, + fontName: left.fontName, /* the merged token keeps the style of its left part */ + fontSize: left.fontSize, }; } @@ -431,6 +468,8 @@ function buildMergedWordTokens(lineItems: CompareTextItem[]) { word: token.word, compareWord: token.compareWord, rect: token.rect, + fontName: token.fontName, + fontSize: token.fontSize, }); } }); @@ -496,16 +535,131 @@ export function mergeIntoLines( }); } +function extractAnnotations( /* Converts raw annotation records into CompareAnnotation objects, skipping records with no subtype and the Link and Widget subtypes. */ + rawAnnotations: Array>, + viewport: pdfjsLib.PageViewport +): CompareAnnotation[] { + return rawAnnotations + .filter((ann) => { + const subtype = ann.subtype as string | undefined; + return subtype && subtype !== 'Link' && subtype !== 'Widget'; + }) + .map((ann, index) => { + const rawRect = ann.rect as number[] | undefined; + let rect = { x: 0, y: 0, width: 0, height: 0 }; /* stays all-zero when the record has no four-number rect */ + if (rawRect && rawRect.length === 4) { + const [p1, p2] = [ + viewport.convertToViewportPoint(rawRect[0], rawRect[1]), + viewport.convertToViewportPoint(rawRect[2], rawRect[3]), + ]; + const x = Math.min(p1[0], p2[0]); + const y = Math.min(p1[1], p2[1]); + rect = { + x, + y, + width: Math.max(Math.abs(p2[0] - p1[0]), 1), /* clamped to at least 1 so the rect never has zero size */ + height: Math.max(Math.abs(p2[1] - p1[1]), 1), + }; + } + const color = ann.color as number[] | undefined; + return { + id: `ann-${index}`, /* index within the filtered list */ + subtype: (ann.subtype as string) || 'Unknown', + rect, + contents: ((ann.contents as string) || '').trim(), /* NOTE(review): some pdf.js releases expose contentsObj and titleObj instead of contents and title; confirm these fields are populated in the bundled version */ + title:
((ann.title as string) || '').trim(), + color: color ? `rgb(${color.join(',')})` : '', + }; + }); +} + /** Collects an image reference for every paintImageXObject and paintInlineImageXObject operator in the operator list, skipping images narrower or shorter than 2 pixels. Opcodes are read from pdfjsLib.OPS because the former literal 84 is paintImageMaskXObjectGroup in pdf.js, not the inline-image operator (86), so inline images were never detected. */ +function extractImages( + opList: { fnArray: number[]; argsArray: unknown[][] }, + viewport: pdfjsLib.PageViewport +): CompareImageRef[] { + const OPS_PAINT_IMAGE = pdfjsLib.OPS.paintImageXObject; + const OPS_PAINT_INLINE_IMAGE = pdfjsLib.OPS.paintInlineImageXObject; + const images: CompareImageRef[] = []; + + for (let i = 0; i < opList.fnArray.length; i++) { + const op = opList.fnArray[i]; + if (op !== OPS_PAINT_IMAGE && op !== OPS_PAINT_INLINE_IMAGE) continue; + + const args = opList.argsArray[i]; + if (!args) continue; + + let imgWidth = 0; + let imgHeight = 0; + /* Inline images carry their size on the image object in args[0]; XObject images pass width and height as args[1] and args[2]. */ + if (op === OPS_PAINT_INLINE_IMAGE && args[0]) { + const imgData = args[0] as Record; + imgWidth = (imgData.width as number) || 0; + imgHeight = (imgData.height as number) || 0; + } else if (op === OPS_PAINT_IMAGE) { + imgWidth = (args[1] as number) || 0; + imgHeight = (args[2] as number) || 0; + } + + if (imgWidth < 2 || imgHeight < 2) continue; + /* NOTE(review): the rect is derived from (0,0) and the pixel size only; the transform in effect when the image is painted is not consulted, so every rect is anchored at the page origin. Confirm whether placement should be tracked. */ + const [vpX, vpY] = viewport.convertToViewportPoint(0, 0); + const [vpX2, vpY2] = viewport.convertToViewportPoint(imgWidth, imgHeight); + const x = Math.min(vpX, vpX2); + const y = Math.min(vpY, vpY2); + + images.push({ + id: `img-${images.length}`, + rect: { + x, + y, + width: Math.abs(vpX2 - vpX) || imgWidth, + height: Math.abs(vpY2 - vpY) || imgHeight, + }, + width: imgWidth, + height: imgHeight, + }); + } + + return images; +} + export async function extractPageModel( page: pdfjsLib.PDFPageProxy, viewport: pdfjsLib.PageViewport ): Promise { - const textContent = await page.getTextContent(); + const [textContent, rawAnnotations, opList] = await Promise.all([ + page.getTextContent(), + page + .getAnnotations({ intent: 'any' }) + .catch(() => [] as Array>), + page + .getOperatorList() + .catch(() => ({ fnArray: [] as number[], argsArray: [] as unknown[][] })), + ]); const styles = textContent.styles ??
{}; + /* Build a lookup from each fontName seen on the text items to the name stored on the matching font object in page.commonObjs. */ + const fontNameMap: FontNameMap = new Map(); + const seenFonts = new Set(); + for (const item of textContent.items) { + if ('fontName' in item && typeof item.fontName === 'string') { + seenFonts.add(item.fontName); + } + } + for (const internalName of seenFonts) { + try { + if (page.commonObjs.has(internalName)) { + const fontObj = page.commonObjs.get(internalName); + if (fontObj?.name && typeof fontObj.name === 'string') { + fontNameMap.set(internalName, fontObj.name); + } + } + } catch { /* best effort: a font that cannot be read simply gets no map entry */ } + } + const rawItems = sortCompareTextItems( textContent.items .filter((item): item is PageTextItem => 'str' in item) - .map((item, index) => toRect(viewport, item, index, styles)) + .map((item, index) => toRect(viewport, item, index, styles, fontNameMap)) .filter((item) => item.normalizedText.length > 0) ); const textItems = mergeIntoLines(rawItems); @@ -518,5 +672,13 @@ export async function extractPageModel( plainText: joinCompareTextItems(textItems), hasText: textItems.length > 0, source: 'pdfjs', + annotations: extractAnnotations( + rawAnnotations as Array>, + viewport + ), + images: extractImages( + opList as { fnArray: number[]; argsArray: unknown[][] }, + viewport + ), }; } diff --git a/src/js/compare/engine/text-normalization.ts b/src/js/compare/engine/text-normalization.ts index 6bc9e1b..896b0c8 100644 --- a/src/js/compare/engine/text-normalization.ts +++ b/src/js/compare/engine/text-normalization.ts @@ -71,6 +71,32 @@ export function tokenizeTextAsSet(text: string): Set { return new Set(tokenizeText(text)); } +const CJK_REGEX = /* matches one code point in U+2E80..U+9FFF, U+F900..U+FAFF, U+FE30..U+FE4F or U+20000..U+2FA1F; no g flag, so test() keeps no state between calls */ + /[\u2E80-\u9FFF\uF900-\uFAFF\uFE30-\uFE4F\u{20000}-\u{2FA1F}]/u; + /** True when the text contains at least one character matched by CJK_REGEX. */ +export function containsCJK(text: string): boolean { + return CJK_REGEX.test(text); +} + /* Lazily created word segmenter, reused across calls. */ +let cachedSegmenter: Intl.Segmenter | null = null; + /** Returns the shared word-granularity Intl.Segmenter, creating it on first use, or null when Intl.Segmenter is not available. */ +function getWordSegmenter(): Intl.Segmenter | null { + if (cachedSegmenter) return cachedSegmenter; + if (typeof Intl !== 'undefined' && Intl.Segmenter) { + cachedSegmenter = new Intl.Segmenter(undefined, { granularity: 'word'
}); + return cachedSegmenter; + } + return null; +} + +export function segmentCJKText(text: string): string[] { + const segmenter = getWordSegmenter(); + if (!segmenter) return [text]; + return [...segmenter.segment(text)] + .filter((seg) => seg.isWordLike) + .map((seg) => seg.segment); +} + export function calculateBoundingRect( rects: CompareRectangle[] ): CompareRectangle { diff --git a/src/js/compare/reporting/export-compare-pdf.ts b/src/js/compare/reporting/export-compare-pdf.ts index 0e66cff..3e56afc 100644 --- a/src/js/compare/reporting/export-compare-pdf.ts +++ b/src/js/compare/reporting/export-compare-pdf.ts @@ -6,7 +6,7 @@ import type { ComparePdfExportMode, } from '../types.ts'; import { extractPageModel } from '../engine/extract-page-model.ts'; -import { comparePageModels } from '../engine/compare-page-models.ts'; +import { comparePageModelsAsync } from '../engine/compare-page-models.ts'; import { COMPARE_COLORS, HIGHLIGHT_OPACITY, @@ -42,6 +42,18 @@ const HIGHLIGHT_COLORS: Record< b: COMPARE_COLORS.modified.b / 255, opacity: HIGHLIGHT_OPACITY, }, + moved: { + r: COMPARE_COLORS.moved.r / 255, + g: COMPARE_COLORS.moved.g / 255, + b: COMPARE_COLORS.moved.b / 255, + opacity: HIGHLIGHT_OPACITY, + }, + 'style-changed': { + r: COMPARE_COLORS['style-changed'].r / 255, + g: COMPARE_COLORS['style-changed'].g / 255, + b: COMPARE_COLORS['style-changed'].b / 255, + opacity: HIGHLIGHT_OPACITY, + }, }; const EXTRACT_SCALE = COMPARE_RENDER.EXPORT_EXTRACT_SCALE; @@ -124,7 +136,7 @@ export async function exportComparePdf( ) : null; - const comparison = comparePageModels(leftModel, rightModel); + const comparison = await comparePageModelsAsync(leftModel, rightModel); const changes = comparison.changes; if (mode === 'split') { diff --git a/src/js/compare/types.ts b/src/js/compare/types.ts index 30b7f71..d5d7a09 100644 --- a/src/js/compare/types.ts +++ b/src/js/compare/types.ts @@ -22,16 +22,24 @@ export interface DiffFocusRegion { height: number; } +export interface 
OcrCacheEntry { + model: ComparePageModel; + width: number; + height: number; +} + export interface CompareCaches { pageModelCache: LRUCache; comparisonCache: LRUCache; comparisonResultsCache: LRUCache; + ocrModelCache: LRUCache; } export interface CompareRenderContext { useOcr: boolean; ocrLanguage: string; viewMode: CompareViewMode; + zoomLevel: number; showLoader: (message: string, percent?: number) => void; } @@ -52,6 +60,8 @@ export interface CompareWordToken { compareWord: string; rect: CompareRectangle; joinsWithPrevious?: boolean; + fontName?: string; + fontSize?: number; } export interface CompareTextItem { @@ -72,6 +82,24 @@ export interface ComparePageModel { plainText: string; hasText: boolean; source: 'pdfjs' | 'ocr'; + annotations?: CompareAnnotation[]; + images?: CompareImageRef[]; +} + +export interface CompareAnnotation { + id: string; + subtype: string; + rect: CompareRectangle; + contents: string; + title: string; + color: string; +} + +export interface CompareImageRef { + id: string; + rect: CompareRectangle; + width: number; + height: number; } export interface ComparePageSignature { @@ -98,12 +126,23 @@ export type CompareChangeType = | 'added' | 'removed' | 'modified' + | 'moved' + | 'style-changed' | 'page-added' | 'page-removed'; +export type CompareContentCategory = + | 'text' + | 'image' + | 'header-footer' + | 'annotation' + | 'formatting' + | 'background'; + export interface CompareTextChange { id: string; type: CompareChangeType; + category: CompareContentCategory; description: string; beforeText: string; afterText: string; @@ -115,6 +154,17 @@ export interface CompareChangeSummary { added: number; removed: number; modified: number; + moved: number; + styleChanged: number; +} + +export interface CompareCategorySummary { + text: number; + image: number; + 'header-footer': number; + annotation: number; + formatting: number; + background: number; } export interface ComparePageResult { @@ -123,12 +173,28 @@ export interface 
ComparePageResult { rightPageNumber: number | null; changes: CompareTextChange[]; summary: CompareChangeSummary; + categorySummary: CompareCategorySummary; visualDiff: CompareVisualDiff | null; confidence?: number; usedOcr?: boolean; } -export type CompareFilterType = 'added' | 'removed' | 'modified' | 'all'; +export type CompareFilterType = + | 'added' + | 'removed' + | 'modified' + | 'moved' + | 'style-changed' + | 'all'; + +export interface CompareCategoryFilterState { + text: boolean; + image: boolean; + 'header-footer': boolean; + annotation: boolean; + formatting: boolean; + background: boolean; +} export interface CompareState { pdfDoc1: pdfjsLib.PDFDocumentProxy | null; @@ -140,7 +206,9 @@ export interface CompareState { activeChangeIndex: number; pagePairs: ComparePagePair[]; activeFilter: CompareFilterType; + categoryFilter: CompareCategoryFilterState; changeSearchQuery: string; useOcr: boolean; ocrLanguage: string; + zoomLevel: number; } diff --git a/src/js/compare/worker-api.ts b/src/js/compare/worker-api.ts new file mode 100644 index 0000000..919a176 --- /dev/null +++ b/src/js/compare/worker-api.ts @@ -0,0 +1,90 @@ +import type { + CompareTextItem, + ComparePageSignature, + ComparePagePair, + CompareChangeSummary, + CompareTextChange, +} from './types.ts'; +import { diffTextRuns } from './engine/diff-text-runs.ts'; +import { pairPages } from './engine/pair-pages.ts'; + +let worker: Worker | null = null; +let messageId = 0; +const pending = new Map< + number, + { resolve: (value: unknown) => void; reject: (reason: unknown) => void } +>(); + +function getWorker(): Worker | null { + if (worker) return worker; + try { + worker = new Worker( + new URL('./engine/compare.worker.ts', import.meta.url), + { type: 'module' } + ); + worker.onmessage = function (e) { + const { id, type, ...rest } = e.data; + const p = pending.get(id); + if (!p) return; + pending.delete(id); + if (type === 'error') { + p.reject(new Error((rest as { message: string }).message)); + } 
else { + p.resolve(rest); + } + }; + worker.onerror = function () { + worker?.terminate(); + worker = null; + for (const [, p] of pending) { + p.reject(new Error('Worker crashed')); + } + pending.clear(); + }; + return worker; + } catch { + return null; + } +} + +function postToWorker(msg: Record): Promise { + const w = getWorker(); + if (!w) return Promise.reject(new Error('No worker')); + const id = ++messageId; + return new Promise((resolve, reject) => { + pending.set(id, { resolve, reject }); + w.postMessage({ ...msg, id }); + }); +} + +export async function diffTextRunsAsync( + beforeItems: CompareTextItem[], + afterItems: CompareTextItem[] +): Promise<{ changes: CompareTextChange[]; summary: CompareChangeSummary }> { + try { + const result = (await postToWorker({ + type: 'diff', + beforeItems, + afterItems, + })) as { changes: CompareTextChange[]; summary: CompareChangeSummary }; + return result; + } catch { + return diffTextRuns(beforeItems, afterItems); + } +} + +export async function pairPagesAsync( + leftPages: ComparePageSignature[], + rightPages: ComparePageSignature[] +): Promise { + try { + const result = (await postToWorker({ + type: 'pair', + leftPages, + rightPages, + })) as { pairs: ComparePagePair[] }; + return result.pairs; + } catch { + return pairPages(leftPages, rightPages); + } +} diff --git a/src/js/logic/compare-pdfs-page.ts b/src/js/logic/compare-pdfs-page.ts index 2253e65..2e267aa 100644 --- a/src/js/logic/compare-pdfs-page.ts +++ b/src/js/logic/compare-pdfs-page.ts @@ -7,9 +7,10 @@ import type { CompareFilterType, ComparePageResult, CompareTextChange, + CompareCategoryFilterState, } from '../compare/types.ts'; import { extractDocumentSignatures } from '../compare/engine/page-signatures.ts'; -import { pairPages } from '../compare/engine/pair-pages.ts'; +import { pairPagesAsync } from '../compare/worker-api.ts'; import type { ComparePdfExportMode, CompareCaches, @@ -39,15 +40,25 @@ const pageState: CompareState = { activeChangeIndex: 0, 
pagePairs: [], activeFilter: 'all', + categoryFilter: { + text: true, + image: true, + 'header-footer': true, + annotation: true, + formatting: true, + background: true, + }, changeSearchQuery: '', useOcr: true, ocrLanguage: 'eng', + zoomLevel: 1.0, }; const caches: CompareCaches = { pageModelCache: new LRUCache(COMPARE_CACHE_MAX_SIZE), comparisonCache: new LRUCache(COMPARE_CACHE_MAX_SIZE), comparisonResultsCache: new LRUCache(COMPARE_CACHE_MAX_SIZE), + ocrModelCache: new LRUCache(COMPARE_CACHE_MAX_SIZE), }; const documentNames = { left: 'first.pdf', @@ -65,6 +76,7 @@ function getRenderContext(): CompareRenderContext { useOcr: pageState.useOcr, ocrLanguage: pageState.ocrLanguage, viewMode: pageState.viewMode, + zoomLevel: pageState.zoomLevel, showLoader, }; } @@ -79,15 +91,22 @@ function getVisibleChanges(result: ComparePageResult | null) { if (pageState.activeFilter === 'removed') { return change.type === 'removed' || change.type === 'page-removed'; } + if (pageState.activeFilter === 'added') { + return change.type === 'added' || change.type === 'page-added'; + } return change.type === pageState.activeFilter; }); + const filteredByCategory = filteredByType.filter( + (change) => pageState.categoryFilter[change.category] + ); + const searchQuery = pageState.changeSearchQuery.trim().toLowerCase(); if (!searchQuery) { - return filteredByType; + return filteredByCategory; } - return filteredByType.filter((change) => { + return filteredByCategory.filter((change) => { const searchableText = [ change.description, change.beforeText, @@ -104,6 +123,8 @@ function updateFilterButtons() { { id: 'filter-modified', filter: 'modified' }, { id: 'filter-added', filter: 'added' }, { id: 'filter-removed', filter: 'removed' }, + { id: 'filter-moved', filter: 'moved' }, + { id: 'filter-style-changed', filter: 'style-changed' }, ]; pills.forEach(({ id, filter }) => { @@ -118,6 +139,10 @@ function updateSummary() { const addedCount = getElement('summary-added-count'); const removedCount 
= getElement('summary-removed-count'); const modifiedCount = getElement('summary-modified-count'); + const movedCount = getElement('summary-moved-count'); + const styleChangedCount = getElement( + 'summary-style-changed-count' + ); const panelLabel1 = getElement('compare-panel-label-1'); const panelLabel2 = getElement('compare-panel-label-2'); @@ -128,6 +153,9 @@ function updateSummary() { if (addedCount) addedCount.textContent = '0'; if (removedCount) removedCount.textContent = '0'; if (modifiedCount) modifiedCount.textContent = '0'; + if (movedCount) movedCount.textContent = '0'; + if (styleChangedCount) styleChangedCount.textContent = '0'; + updateCategoryPills(null); return; } @@ -136,6 +164,34 @@ function updateSummary() { removedCount.textContent = comparison.summary.removed.toString(); if (modifiedCount) modifiedCount.textContent = comparison.summary.modified.toString(); + if (movedCount) movedCount.textContent = comparison.summary.moved.toString(); + if (styleChangedCount) + styleChangedCount.textContent = comparison.summary.styleChanged.toString(); + + updateCategoryPills(comparison); +} + +function updateCategoryPills(comparison: ComparePageResult | null) { + const categoryKeys: Array = [ + 'text', + 'image', + 'header-footer', + 'annotation', + 'formatting', + 'background', + ]; + + const summary = comparison?.categorySummary; + + for (const key of categoryKeys) { + const countEl = getElement(`category-count-${key}`); + const pill = getElement(`category-${key}`); + if (countEl) countEl.textContent = summary ? 
summary[key].toString() : '0'; + if (pill) { + pill.classList.toggle('active', pageState.categoryFilter[key]); + pill.classList.toggle('disabled', !pageState.categoryFilter[key]); + } + } } function renderHighlights() { @@ -233,26 +289,60 @@ function renderChangeList() { emptyState.classList.add('hidden'); list.classList.remove('hidden'); + const typeLabels: Record = { + added: 'Added', + removed: 'Deleted', + modified: 'Modified', + moved: 'Moved', + 'style-changed': 'Style Changed', + 'page-added': 'Page Added', + 'page-removed': 'Page Removed', + }; + + const grouped = new Map< + string, + Array<{ change: CompareTextChange; index: number }> + >(); visibleChanges.forEach((change, index) => { - const item = document.createElement('div'); - item.className = `compare-change-item${index === pageState.activeChangeIndex ? ' active' : ''}`; - item.innerHTML = ` - -
-
${change.description}
-
- ${change.type.replace('-', ' ')} - `; - - item.addEventListener('click', function () { - pageState.activeChangeIndex = index; - renderComparisonUI(); - scrollToChange(change); - }); - - list.appendChild(item); + const key = change.type; + if (!grouped.has(key)) grouped.set(key, []); + grouped.get(key)!.push({ change, index }); }); + for (const [type, entries] of grouped) { + const header = document.createElement('div'); + header.className = 'compare-section-header'; + header.innerHTML = ` + + ${entries.length} + + `; + list.appendChild(header); + + const arrowSvg = + ''; + + for (const { change, index } of entries) { + const item = document.createElement('div'); + item.className = `compare-change-item${index === pageState.activeChangeIndex ? ' active' : ''}`; + const safeDesc = change.description + .replace(/&/g, '&') + .replace(//g, '>') + .replace(/\n/g, '
') + .replace(/→/g, arrowSvg); + item.innerHTML = `
${safeDesc}
`; + + item.addEventListener('click', function () { + pageState.activeChangeIndex = index; + renderComparisonUI(); + scrollToChange(change); + }); + + list.appendChild(item); + } + } + prevChangeBtn.disabled = false; nextChangeBtn.disabled = false; exportDropdownBtn.disabled = pageState.pagePairs.length === 0; @@ -289,7 +379,7 @@ async function buildPagePairs() { } ); - pageState.pagePairs = pairPages(leftSignatures, rightSignatures); + pageState.pagePairs = await pairPagesAsync(leftSignatures, rightSignatures); pageState.currentPage = 1; } @@ -668,6 +758,8 @@ document.addEventListener('DOMContentLoaded', function () { { id: 'filter-modified', filter: 'modified' }, { id: 'filter-added', filter: 'added' }, { id: 'filter-removed', filter: 'removed' }, + { id: 'filter-moved', filter: 'moved' }, + { id: 'filter-style-changed', filter: 'style-changed' }, ]; if (syncToggle) { @@ -725,6 +817,59 @@ document.addEventListener('DOMContentLoaded', function () { }); } + const ZOOM_STEP = 0.25; + const ZOOM_MIN = 0.25; + const ZOOM_MAX = 5.0; + const zoomInBtn = getElement('zoom-in-btn'); + const zoomOutBtn = getElement('zoom-out-btn'); + const zoomResetBtn = getElement('zoom-reset-btn'); + const zoomDisplay = getElement('zoom-level-display'); + + function updateZoomDisplay() { + if (zoomDisplay) { + zoomDisplay.textContent = `${Math.round(pageState.zoomLevel * 100)}%`; + } + if (zoomOutBtn) zoomOutBtn.disabled = pageState.zoomLevel <= ZOOM_MIN; + if (zoomInBtn) zoomInBtn.disabled = pageState.zoomLevel >= ZOOM_MAX; + } + + function applyZoom() { + updateZoomDisplay(); + caches.pageModelCache.clear(); + caches.comparisonCache.clear(); + caches.comparisonResultsCache.clear(); + if (pageState.pdfDoc1 && pageState.pdfDoc2) { + renderBothPages().catch(console.error); + } + } + + if (zoomInBtn) { + zoomInBtn.addEventListener('click', function () { + pageState.zoomLevel = Math.min( + Math.round((pageState.zoomLevel + ZOOM_STEP) * 100) / 100, + ZOOM_MAX + ); + applyZoom(); + }); + } + + 
if (zoomOutBtn) { + zoomOutBtn.addEventListener('click', function () { + pageState.zoomLevel = Math.max( + Math.round((pageState.zoomLevel - ZOOM_STEP) * 100) / 100, + ZOOM_MIN + ); + applyZoom(); + }); + } + + if (zoomResetBtn) { + zoomResetBtn.addEventListener('click', function () { + pageState.zoomLevel = 1.0; + applyZoom(); + }); + } + filterButtons.forEach(({ id, filter }) => { const button = getElement(id); if (!button) return; @@ -739,6 +884,26 @@ document.addEventListener('DOMContentLoaded', function () { }); }); + const categoryKeys: Array = [ + 'text', + 'image', + 'header-footer', + 'annotation', + 'formatting', + 'background', + ]; + + for (const key of categoryKeys) { + const pill = getElement(`category-${key}`); + if (pill) { + pill.addEventListener('click', function () { + pageState.categoryFilter[key] = !pageState.categoryFilter[key]; + pageState.activeChangeIndex = 0; + renderComparisonUI(); + }); + } + } + if (ocrToggle) { ocrToggle.checked = pageState.useOcr; ocrToggle.addEventListener('change', async function () { diff --git a/src/js/logic/compare-render.ts b/src/js/logic/compare-render.ts index 1ebbdc0..ffdc2c9 100644 --- a/src/js/logic/compare-render.ts +++ b/src/js/logic/compare-render.ts @@ -3,6 +3,9 @@ import type { ComparePageModel, ComparePagePair, ComparePageResult, + CompareRectangle, + CompareWordToken, + CompareTextItem, RenderedPage, ComparisonPageLoad, DiffFocusRegion, @@ -10,7 +13,7 @@ import type { CompareRenderContext, } from '../compare/types.ts'; import { extractPageModel } from '../compare/engine/extract-page-model.ts'; -import { comparePageModels } from '../compare/engine/compare-page-models.ts'; +import { comparePageModelsAsync } from '../compare/engine/compare-page-models.ts'; import { renderVisualDiff } from '../compare/engine/visual-diff.ts'; import { recognizePageCanvas } from '../compare/engine/ocr-page.ts'; import { isLowQualityExtractedText } from '../compare/engine/text-normalization.ts'; @@ -48,7 +51,8 @@ export 
function hidePlaceholder(placeholderId: string) { export function getRenderScale( page: pdfjsLib.PDFPageProxy, container: HTMLElement, - viewMode: 'overlay' | 'side-by-side' + viewMode: 'overlay' | 'side-by-side', + zoomLevel = 1.0 ) { const baseViewport = page.getViewport({ scale: 1.0 }); const availableWidth = Math.max( @@ -61,7 +65,8 @@ export function getRenderScale( ? COMPARE_RENDER.MAX_SCALE_OVERLAY : COMPARE_RENDER.MAX_SCALE_SIDE; - return Math.min(Math.max(fitScale, 1.0), maxScale); + const baseScale = Math.min(Math.max(fitScale, 1.0), maxScale); + return baseScale * zoomLevel; } export function getPageModelCacheKey( @@ -76,6 +81,72 @@ function shouldUseOcrForModel(model: ComparePageModel) { return !model.hasText || isLowQualityExtractedText(model.plainText); } +function rescaleRect( + rect: CompareRectangle, + scaleX: number, + scaleY: number +): CompareRectangle { + return { + x: rect.x * scaleX, + y: rect.y * scaleY, + width: rect.width * scaleX, + height: rect.height * scaleY, + }; +} + +function rescaleWordToken( + token: CompareWordToken, + scaleX: number, + scaleY: number +): CompareWordToken { + return { + ...token, + rect: rescaleRect(token.rect, scaleX, scaleY), + }; +} + +function rescaleTextItem( + item: CompareTextItem, + scaleX: number, + scaleY: number +): CompareTextItem { + return { + ...item, + rect: rescaleRect(item.rect, scaleX, scaleY), + charMap: item.charMap?.map((c) => ({ + x: c.x * scaleX, + width: c.width * scaleX, + })), + wordTokens: item.wordTokens?.map((t) => + rescaleWordToken(t, scaleX, scaleY) + ), + fragments: item.fragments?.map((f) => rescaleTextItem(f, scaleX, scaleY)), + }; +} + +function rescalePageModel( + model: ComparePageModel, + cachedWidth: number, + cachedHeight: number, + targetWidth: number, + targetHeight: number +): ComparePageModel { + const scaleX = targetWidth / Math.max(cachedWidth, 1); + const scaleY = targetHeight / Math.max(cachedHeight, 1); + return { + ...model, + width: targetWidth, + height: 
targetHeight, + textItems: model.textItems.map((item) => + rescaleTextItem(item, scaleX, scaleY) + ), + }; +} + +function getOcrCacheKey(side: string, pageNum: number) { + return `${side}-${pageNum}`; +} + export function buildDiffFocusRegion( comparison: ComparePageResult, leftCanvas: HTMLCanvasElement, @@ -164,7 +235,12 @@ export async function renderPage( const page = await pdfDoc.getPage(pageNum); - const targetScale = getRenderScale(page, container, ctx.viewMode); + const targetScale = getRenderScale( + page, + container, + ctx.viewMode, + ctx.zoomLevel + ); const scaledViewport = page.getViewport({ scale: targetScale }); const dpr = window.devicePixelRatio || 1; const hiResViewport = page.getViewport({ scale: targetScale * dpr }); @@ -192,18 +268,36 @@ export async function renderPage( let finalModel = model; if (!cachedModel && ctx.useOcr && shouldUseOcrForModel(model)) { - ctx.showLoader(`Running OCR on page ${pageNum}...`); - const ocrModel = await recognizePageCanvas( - canvas, - ctx.ocrLanguage, - function (status, progress) { - ctx.showLoader(`OCR: ${status}`, progress * 100); - } - ); - finalModel = { - ...ocrModel, - pageNumber: pageNum, - }; + const ocrKey = getOcrCacheKey(cacheKeyPrefix, pageNum); + const cachedOcr = caches.ocrModelCache.get(ocrKey); + if (cachedOcr) { + finalModel = rescalePageModel( + cachedOcr.model, + cachedOcr.width, + cachedOcr.height, + scaledViewport.width, + scaledViewport.height + ); + finalModel.pageNumber = pageNum; + } else { + ctx.showLoader(`Running OCR on page ${pageNum}...`); + const ocrModel = await recognizePageCanvas( + canvas, + ctx.ocrLanguage, + function (status, progress) { + ctx.showLoader(`OCR: ${status}`, progress * 100); + } + ); + finalModel = { + ...ocrModel, + pageNumber: pageNum, + }; + caches.ocrModelCache.set(ocrKey, { + model: finalModel, + width: scaledViewport.width, + height: scaledViewport.height, + }); + } } caches.pageModelCache.set(cacheKey, finalModel); @@ -276,11 +370,29 @@ export async 
function loadComparisonPage( let finalModel = extractedModel; if (ctx.useOcr && shouldUseOcrForModel(extractedModel)) { - const ocrModel = await recognizePageCanvas(canvas, ctx.ocrLanguage); - finalModel = { - ...ocrModel, - pageNumber: pageNum, - }; + const ocrKey = getOcrCacheKey(side, pageNum); + const cachedOcr = caches.ocrModelCache.get(ocrKey); + if (cachedOcr) { + finalModel = rescalePageModel( + cachedOcr.model, + cachedOcr.width, + cachedOcr.height, + viewport.width, + viewport.height + ); + finalModel.pageNumber = pageNum; + } else { + const ocrModel = await recognizePageCanvas(canvas, ctx.ocrLanguage); + finalModel = { + ...ocrModel, + pageNumber: pageNum, + }; + caches.ocrModelCache.set(ocrKey, { + model: finalModel, + width: viewport.width, + height: viewport.height, + }); + } } canvas.width = 0; @@ -330,7 +442,10 @@ export async function computeComparisonForPair( ctx ); - const comparison = comparePageModels(leftPage.model, rightPage.model); + const comparison = await comparePageModelsAsync( + leftPage.model, + rightPage.model + ); comparison.confidence = pair.confidence; if ( diff --git a/src/pages/compare-pdfs.html b/src/pages/compare-pdfs.html index 2f4c717..03346aa 100644 --- a/src/pages/compare-pdfs.html +++ b/src/pages/compare-pdfs.html @@ -209,6 +209,14 @@ background: rgba(245, 158, 11, 0.28); } + .compare-highlight.moved { + background: rgba(168, 85, 247, 0.28); + } + + .compare-highlight.style-changed { + background: rgba(59, 130, 246, 0.28); + } + .compare-highlight.active { outline: 2px solid rgba(99, 102, 241, 0.7); outline-offset: 1px; @@ -232,11 +240,6 @@ display: none; } - .compare-change-item.active { - border-color: #818cf8; - background: rgba(79, 70, 229, 0.12); - } - .compare-sidebar { display: flex; flex-direction: column; @@ -245,30 +248,50 @@ border: 1px solid rgba(51, 65, 85, 0.5); border-radius: 0.75rem; overflow: hidden; - height: clamp(36rem, 82vh, 72rem); + min-height: 0; } .compare-sidebar-header { - padding: 0.75rem 1rem; 
+ padding: 0.5rem 0.75rem; border-bottom: 1px solid rgba(51, 65, 85, 0.5); } .compare-sidebar-filters { display: flex; - align-items: center; - gap: 0.375rem; - padding: 0.625rem 1rem; + flex-direction: column; + gap: 0; + padding: 0; border-bottom: 1px solid rgba(51, 65, 85, 0.4); + } + + .compare-filter-group { + display: flex; flex-wrap: wrap; + align-items: center; + gap: 0.25rem; + padding: 0.375rem 0.75rem; + } + + .compare-filter-group + .compare-filter-group { + border-top: 1px solid rgba(51, 65, 85, 0.25); + } + + .compare-filter-label { + width: 100%; + font-size: 0.5625rem; + font-weight: 600; + text-transform: uppercase; + letter-spacing: 0.08em; + color: #64748b; } .compare-pill { display: inline-flex; align-items: center; - gap: 0.25rem; + gap: 0.1875rem; border-radius: 9999px; - padding: 0.25rem 0.625rem; - font-size: 0.6875rem; + padding: 0.125rem 0.5rem; + font-size: 0.625rem; font-weight: 600; border: 1px solid transparent; cursor: pointer; @@ -309,51 +332,130 @@ border-color: rgba(34, 197, 94, 0.5); } + .compare-pill.moved { + color: #c4b5fd; + background: rgba(168, 85, 247, 0.1); + border-color: rgba(168, 85, 247, 0.15); + } + + .compare-pill.moved.active { + background: rgba(168, 85, 247, 0.25); + border-color: rgba(168, 85, 247, 0.5); + } + + .compare-pill.style-changed { + color: #93c5fd; + background: rgba(59, 130, 246, 0.1); + border-color: rgba(59, 130, 246, 0.15); + } + + .compare-pill.style-changed.active { + background: rgba(59, 130, 246, 0.25); + border-color: rgba(59, 130, 246, 0.5); + } + + .compare-pill.category { + color: #a5b4fc; + background: rgba(99, 102, 241, 0.1); + border-color: rgba(99, 102, 241, 0.15); + } + + .compare-pill.category.active { + background: rgba(99, 102, 241, 0.25); + border-color: rgba(99, 102, 241, 0.5); + } + + .compare-pill.category.disabled { + opacity: 0.35; + text-decoration: line-through; + } + .compare-change-list { flex: 1; min-height: 0; overflow-y: auto; + padding: 0.75rem; + } + + 
#compare-change-list { display: flex; flex-direction: column; - gap: 1rem; - padding: 1rem; + gap: 0.5rem; } .compare-change-item { - display: flex; - align-items: flex-start; - gap: 0.625rem; - padding: 0.75rem 1rem; + padding: 0.5rem 0.75rem; cursor: pointer; - transition: background 0.1s; + transition: background 0.15s; border: 1px solid rgba(51, 65, 85, 0.3); - border-left: 2px solid transparent; border-radius: 0.5rem; font-size: 0.8125rem; color: #cbd5e1; - line-height: 1.4; + line-height: 1.5; } .compare-change-item:hover { - background: rgba(99, 102, 241, 0.08); + background: rgba(99, 102, 241, 0.06); + border-color: rgba(99, 102, 241, 0.2); } .compare-change-item.active { - background: rgba(99, 102, 241, 0.15); - border-left: 2px solid #818cf8; + background: rgba(99, 102, 241, 0.1); + border-color: rgba(99, 102, 241, 0.3); + } + + .compare-section-header { + display: flex; + align-items: center; + gap: 0.375rem; + padding: 0.25rem 0; + margin-top: 0.25rem; + } + + .compare-section-label { + font-size: 0.6875rem; + font-weight: 600; + text-transform: uppercase; + letter-spacing: 0.05em; + } + + .compare-section-label.added, + .compare-section-label.page-added { + color: #86efac; + } + .compare-section-label.removed, + .compare-section-label.page-removed { + color: #fca5a5; + } + .compare-section-label.modified { + color: #fcd34d; + } + .compare-section-label.moved { + color: #c4b5fd; + } + .compare-section-label.style-changed { + color: #93c5fd; + } + + .compare-section-count { + font-size: 0.625rem; + font-weight: 600; + color: #64748b; + } + + .compare-section-line { + flex: 1; + height: 1px; + background: rgba(51, 65, 85, 0.4); } .compare-change-dot { - width: 0.5rem; - height: 0.5rem; + width: 0.375rem; + height: 0.375rem; border-radius: 50%; flex-shrink: 0; } - .compare-change-item .compare-change-dot { - margin-top: 0.35rem; - } - .compare-change-dot.added { background: #22c55e; } @@ -369,36 +471,18 @@ .compare-change-dot.page-removed { background: 
#ef4444; } + .compare-change-dot.moved { + background: #a855f7; + } + .compare-change-dot.style-changed { + background: #3b82f6; + } .compare-change-desc { - flex: 1; - min-width: 0; - } - - .compare-change-desc-text { - white-space: normal; + font-size: 0.8125rem; + color: #e2e8f0; overflow-wrap: anywhere; - } - - .compare-change-type { - font-size: 0.625rem; - font-weight: 600; - text-transform: uppercase; - letter-spacing: 0.04em; - flex-shrink: 0; - margin-top: 0.2rem; - } - - .compare-change-type.added, - .compare-change-type.page-added { - color: #86efac; - } - .compare-change-type.removed, - .compare-change-type.page-removed { - color: #fca5a5; - } - .compare-change-type.modified { - color: #fcd34d; + white-space: pre-line; } .compare-change-empty { @@ -415,7 +499,7 @@ .compare-sidebar { height: auto; - max-height: 20rem; + max-height: 24rem; } .compare-viewer-wrapper.side-by-side-mode { @@ -592,6 +676,35 @@ +
+ + + 100% + + +
@@ -732,35 +856,71 @@
- - - - +
+
Change Type
+ + + + + +
+
+
Content
+ + + + + + +
diff --git a/src/tests/compare/diff-text-runs.test.ts b/src/tests/compare/diff-text-runs.test.ts index 2e8db58..1ab723e 100644 --- a/src/tests/compare/diff-text-runs.test.ts +++ b/src/tests/compare/diff-text-runs.test.ts @@ -6,7 +6,11 @@ import { mergeIntoLines, sortCompareTextItems, } from '@/js/compare/engine/extract-page-model.ts'; -import type { ComparePageModel, CompareTextItem } from '@/js/compare/types.ts'; +import type { + ComparePageModel, + CompareTextItem, + CompareWordToken, +} from '@/js/compare/types.ts'; function makeItem(id: string, text: string): CompareTextItem { return { @@ -39,7 +43,13 @@ describe('diffTextRuns', () => { [makeItem('a', 'Hello'), makeItem('c', 'there')] ); - expect(result.summary).toEqual({ added: 0, removed: 0, modified: 1 }); + expect(result.summary).toEqual({ + added: 0, + removed: 0, + modified: 1, + moved: 0, + styleChanged: 0, + }); expect(result.changes).toHaveLength(1); expect(result.changes[0].type).toBe('modified'); expect(result.changes[0].beforeText).toBe('world'); @@ -52,7 +62,13 @@ describe('diffTextRuns', () => { [makeItem('a', 'Hello'), makeItem('b', 'again')] ); - expect(result.summary).toEqual({ added: 1, removed: 0, modified: 0 }); + expect(result.summary).toEqual({ + added: 1, + removed: 0, + modified: 0, + moved: 0, + styleChanged: 0, + }); expect(result.changes[0].type).toBe('added'); }); @@ -86,7 +102,13 @@ describe('diffTextRuns', () => { ); expect(result.changes).toHaveLength(2); - expect(result.summary).toEqual({ added: 1, removed: 0, modified: 1 }); + expect(result.summary).toEqual({ + added: 1, + removed: 0, + modified: 1, + moved: 0, + styleChanged: 0, + }); expect( result.changes.some( (change) => @@ -308,6 +330,221 @@ describe('mergeIntoLines', () => { ); expect(result.changes).toHaveLength(0); - expect(result.summary).toEqual({ added: 0, removed: 0, modified: 0 }); + expect(result.summary).toEqual({ + added: 0, + removed: 0, + modified: 0, + moved: 0, + styleChanged: 0, + }); + }); +}); + /* Test helper: builds a CompareTextItem whose wordTokens all carry the given fontName and fontSize. The text is split on whitespace; each token gets a lower-cased compareWord and a rect laid out along a 10-unit-wide, 10-unit-high line, positioned and sized in proportion to the word's character offset and length. */ +function 
makeItemWithTokens( + id: string, + text: string, + fontName?: string, + fontSize?: number +): CompareTextItem { + const words = text.split(/\s+/).filter(Boolean); + const charWidth = 10 / Math.max(text.length, 1); + let offset = 0; /* search cursor: indexOf starts here so a repeated word resolves to its next occurrence */ + const wordTokens: CompareWordToken[] = words.map((w) => { + const startIndex = text.indexOf(w, offset); + offset = startIndex + w.length; + return { + word: w, + compareWord: w.toLowerCase(), + rect: { + x: startIndex * charWidth, + y: 0, + width: w.length * charWidth, + height: 10, + }, + fontName, + fontSize, + }; + }); + return { + id, + text, + normalizedText: text, + rect: { x: 0, y: 0, width: 10, height: 10 }, + wordTokens, + }; +} + /* Style-change detection, exercised through diffTextRuns: both sides carry the same text and only the per-token font name or size is varied (or kept equal). */ +describe('detectStyleChanges', () => { + it('detects font name change on identical text', () => { + const result = diffTextRuns( + [makeItemWithTokens('a', 'Hello world test', 'Arial', 12)], + [makeItemWithTokens('b', 'Hello world test', 'Times', 12)] + ); + + expect(result.summary.styleChanged).toBe(1); + expect(result.changes.some((c) => c.type === 'style-changed')).toBe(true); + }); + + it('detects font size change on identical text', () => { + const result = diffTextRuns( + [makeItemWithTokens('a', 'Hello world test', 'Arial', 12)], + [makeItemWithTokens('b', 'Hello world test', 'Arial', 16)] + ); + + expect(result.summary.styleChanged).toBe(1); + const sc = result.changes.find((c) => c.type === 'style-changed')!; + expect(sc.beforeText).toBe('Hello world test'); + }); + /* 12 vs 12.3 must not count as a style change; the exact tolerance is defined by the implementation, not by this test. */ + it('ignores negligible font size difference', () => { + const result = diffTextRuns( + [makeItemWithTokens('a', 'Same text here', 'Arial', 12)], + [makeItemWithTokens('b', 'Same text here', 'Arial', 12.3)] + ); + + expect(result.summary.styleChanged).toBe(0); + }); + + it('reports no style change when fonts match', () => { + const result = diffTextRuns( + [makeItemWithTokens('a', 'Identical font', 'Arial', 12)], + [makeItemWithTokens('b', 'Identical font', 'Arial', 12)] + ); + + expect(result.changes).toHaveLength(0); + 
expect(result.summary.styleChanged).toBe(0); + }); + /* The two font names differ only in their d0 / d1 segment and are expected to compare as the same font. */ + it('ignores pdfjs document-scoped font name prefixes', () => { + const result = diffTextRuns( + [makeItemWithTokens('a', 'Same font here', 'g_d0_f3', 12)], + [makeItemWithTokens('b', 'Same font here', 'g_d1_f3', 12)] + ); + + expect(result.changes).toHaveLength(0); + expect(result.summary.styleChanged).toBe(0); + }); +}); + /* Move detection: two items whose order is swapped should surface as 'moved' changes instead of a removed/added pair, while short or dissimilar text must not be reported as moved. */ +describe('detectMovedText', () => { + it('detects moved text block with identical words', () => { + const result = diffTextRuns( + [ + makeItem('a', 'Introduction to the topic'), + makeItem('b', 'Another paragraph here'), + ], + [ + makeItem('c', 'Another paragraph here'), + makeItem('d', 'Introduction to the topic'), + ] + ); + + expect(result.summary.moved).toBeGreaterThanOrEqual(1); + expect(result.changes.some((c) => c.type === 'moved')).toBe(true); + expect(result.changes.some((c) => c.type === 'removed')).toBe(false); + expect(result.changes.some((c) => c.type === 'added')).toBe(false); + }); + + it('does not detect move for short text', () => { + const result = diffTextRuns( + [makeItem('a', 'Hi'), makeItem('b', 'World')], + [makeItem('c', 'World'), makeItem('d', 'Hi')] + ); + + expect(result.summary.moved).toBe(0); + }); + + it('does not detect move when text is dissimilar', () => { + const result = diffTextRuns( + [makeItem('a', 'This is the first paragraph with details')], + [makeItem('b', 'Completely different content and wording here')] + ); + + expect(result.summary.moved).toBe(0); + }); +}); + /* Diffing of text written without spaces between words. NOTE(review): the first test title says Chinese, but the fixture strings are Japanese (they contain kana); the assertions only check that a change is or is not reported, not how the text was segmented. */ +describe('CJK segmentation in diffTextRuns', () => { + it('segments Chinese text into words', () => { + const result = diffTextRuns( + [makeItem('a', '日本語テストです')], + [makeItem('b', '日本語テストでした')] + ); + + expect(result.changes.length).toBeGreaterThan(0); + expect(result.summary.modified).toBeGreaterThanOrEqual(1); + }); + + it('reports no changes for identical CJK text', () => { + const result = diffTextRuns( + [makeItem('a', '日本語テストです')], + [makeItem('b', '日本語テストです')] + ); + + 
expect(result.changes).toHaveLength(0); + }); +}); + /* Category tagging: every reported change is expected to carry a category. Modified, moved and page-removed changes expect 'text'; style-changed changes expect 'formatting'. */ +describe('content categories', () => { + it('assigns text category to added/removed/modified changes', () => { + const result = diffTextRuns( + [makeItem('a', 'Hello world')], + [makeItem('b', 'Hello there')] + ); + + expect(result.changes).toHaveLength(1); + expect(result.changes[0].category).toBe('text'); + }); + + it('assigns formatting category to style-changed changes', () => { + const result = diffTextRuns( + [makeItemWithTokens('a', 'Hello world test', 'Arial', 12)], + [makeItemWithTokens('b', 'Hello world test', 'Times', 12)] + ); + + const styleChange = result.changes.find((c) => c.type === 'style-changed'); + expect(styleChange).toBeDefined(); + expect(styleChange!.category).toBe('formatting'); + }); + + it('assigns text category to moved changes', () => { + const result = diffTextRuns( + [ + makeItem('a', 'Introduction to the topic'), + makeItem('b', 'Another paragraph here'), + ], + [ + makeItem('c', 'Another paragraph here'), + makeItem('d', 'Introduction to the topic'), + ] + ); + + const movedChange = result.changes.find((c) => c.type === 'moved'); + expect(movedChange).toBeDefined(); + expect(movedChange!.category).toBe('text'); + }); + /* Sums every value in categorySummary and only requires that at least one change was counted in some category. */ + it('includes categorySummary on page comparison result', () => { + const result = comparePageModels( + makePage(1, [makeItem('a', 'Hello')]), + makePage(1, [makeItem('b', 'World')]) + ); + + expect(result.categorySummary).toBeDefined(); + const total = Object.values(result.categorySummary).reduce( + (a, b) => a + b, + 0 + ); + expect(total).toBeGreaterThanOrEqual(1); + }); + /* Passing null as the second page model is the page-removed scenario named in the title. */ + it('assigns text category to page-removed changes', () => { + const result = comparePageModels( + makePage(1, [makeItem('a', 'Only')]), + null + ); + + expect(result.changes[0].category).toBe('text'); + expect(result.categorySummary.text).toBe(1); }); });