refactor: update PDF comparison types and enhance UI for better usability
- Refactored CompareState to import from a centralized type definition. - Enhanced the compare-pdfs.html layout with improved styles for overlay and side-by-side modes. - Added new CSS styles for various UI components including panels, buttons, and highlights. - Implemented a new sidebar for displaying change summaries and filters. - Created unit tests for text comparison logic, including diffing text runs and page pairing. - Added tests for text normalization functions to ensure proper handling of punctuation and character normalization.
This commit is contained in:
43
package-lock.json
generated
43
package-lock.json
generated
@@ -1,12 +1,12 @@
|
|||||||
{
|
{
|
||||||
"name": "bento-pdf",
|
"name": "bento-pdf",
|
||||||
"version": "2.4.0",
|
"version": "2.4.1",
|
||||||
"lockfileVersion": 3,
|
"lockfileVersion": 3,
|
||||||
"requires": true,
|
"requires": true,
|
||||||
"packages": {
|
"packages": {
|
||||||
"": {
|
"": {
|
||||||
"name": "bento-pdf",
|
"name": "bento-pdf",
|
||||||
"version": "2.4.0",
|
"version": "2.4.1",
|
||||||
"license": "AGPL-3.0-only",
|
"license": "AGPL-3.0-only",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@fontsource/cedarville-cursive": "^5.2.7",
|
"@fontsource/cedarville-cursive": "^5.2.7",
|
||||||
@@ -30,6 +30,7 @@
|
|||||||
"blob-stream": "^0.1.3",
|
"blob-stream": "^0.1.3",
|
||||||
"bwip-js": "^4.8.0",
|
"bwip-js": "^4.8.0",
|
||||||
"cropperjs": "^1.6.2",
|
"cropperjs": "^1.6.2",
|
||||||
|
"diff": "^8.0.3",
|
||||||
"embedpdf-snippet": "file:vendor/embedpdf/embedpdf-snippet-2.3.0.tgz",
|
"embedpdf-snippet": "file:vendor/embedpdf/embedpdf-snippet-2.3.0.tgz",
|
||||||
"heic2any": "^0.0.4",
|
"heic2any": "^0.0.4",
|
||||||
"highlight.js": "^11.11.1",
|
"highlight.js": "^11.11.1",
|
||||||
@@ -55,11 +56,13 @@
|
|||||||
"markdown-it-task-lists": "^2.1.1",
|
"markdown-it-task-lists": "^2.1.1",
|
||||||
"markdown-it-toc-done-right": "^4.2.0",
|
"markdown-it-toc-done-right": "^4.2.0",
|
||||||
"mermaid": "^11.12.3",
|
"mermaid": "^11.12.3",
|
||||||
|
"microdiff": "^1.5.0",
|
||||||
"node-forge": "^1.3.3",
|
"node-forge": "^1.3.3",
|
||||||
"papaparse": "^5.5.3",
|
"papaparse": "^5.5.3",
|
||||||
"pdf-lib": "^1.17.1",
|
"pdf-lib": "^1.17.1",
|
||||||
"pdfjs-dist": "^5.4.624",
|
"pdfjs-dist": "^5.4.624",
|
||||||
"pdfkit": "^0.17.2",
|
"pdfkit": "^0.17.2",
|
||||||
|
"pixelmatch": "^7.1.0",
|
||||||
"postal-mime": "^2.7.3",
|
"postal-mime": "^2.7.3",
|
||||||
"rete": "^2.0.6",
|
"rete": "^2.0.6",
|
||||||
"rete-area-plugin": "^2.1.5",
|
"rete-area-plugin": "^2.1.5",
|
||||||
@@ -6353,6 +6356,15 @@
|
|||||||
"integrity": "sha512-ED3jP8saaweFTjeGX8HQPjeC1YYyZs98jGNZx6IiBvxW7JG5v492kamAQB3m2wop07CvU/RQmzcKr6bgcC5D/Q==",
|
"integrity": "sha512-ED3jP8saaweFTjeGX8HQPjeC1YYyZs98jGNZx6IiBvxW7JG5v492kamAQB3m2wop07CvU/RQmzcKr6bgcC5D/Q==",
|
||||||
"license": "MIT"
|
"license": "MIT"
|
||||||
},
|
},
|
||||||
|
"node_modules/diff": {
|
||||||
|
"version": "8.0.3",
|
||||||
|
"resolved": "https://registry.npmjs.org/diff/-/diff-8.0.3.tgz",
|
||||||
|
"integrity": "sha512-qejHi7bcSD4hQAZE0tNAawRK1ZtafHDmMTMkrrIGgSLl7hTnQHmKCeB45xAcbfTqK2zowkM3j3bHt/4b/ARbYQ==",
|
||||||
|
"license": "BSD-3-Clause",
|
||||||
|
"engines": {
|
||||||
|
"node": ">=0.3.1"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/diffie-hellman": {
|
"node_modules/diffie-hellman": {
|
||||||
"version": "5.0.3",
|
"version": "5.0.3",
|
||||||
"resolved": "https://registry.npmjs.org/diffie-hellman/-/diffie-hellman-5.0.3.tgz",
|
"resolved": "https://registry.npmjs.org/diffie-hellman/-/diffie-hellman-5.0.3.tgz",
|
||||||
@@ -9068,6 +9080,12 @@
|
|||||||
"uuid": "^11.1.0"
|
"uuid": "^11.1.0"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/microdiff": {
|
||||||
|
"version": "1.5.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/microdiff/-/microdiff-1.5.0.tgz",
|
||||||
|
"integrity": "sha512-Drq+/THMvDdzRYrK0oxJmOKiC24ayUV8ahrt8l3oRK51PWt6gdtrIGrlIH3pT/lFh1z93FbAcidtsHcWbnRz8Q==",
|
||||||
|
"license": "MIT"
|
||||||
|
},
|
||||||
"node_modules/micromark-util-character": {
|
"node_modules/micromark-util-character": {
|
||||||
"version": "2.1.1",
|
"version": "2.1.1",
|
||||||
"resolved": "https://registry.npmjs.org/micromark-util-character/-/micromark-util-character-2.1.1.tgz",
|
"resolved": "https://registry.npmjs.org/micromark-util-character/-/micromark-util-character-2.1.1.tgz",
|
||||||
@@ -9896,6 +9914,18 @@
|
|||||||
"url": "https://github.com/sponsors/jonschlinkert"
|
"url": "https://github.com/sponsors/jonschlinkert"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/pixelmatch": {
|
||||||
|
"version": "7.1.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/pixelmatch/-/pixelmatch-7.1.0.tgz",
|
||||||
|
"integrity": "sha512-1wrVzJ2STrpmONHKBy228LM1b84msXDUoAzVEl0R8Mz4Ce6EPr+IVtxm8+yvrqLYMHswREkjYFaMxnyGnaY3Ng==",
|
||||||
|
"license": "ISC",
|
||||||
|
"dependencies": {
|
||||||
|
"pngjs": "^7.0.0"
|
||||||
|
},
|
||||||
|
"bin": {
|
||||||
|
"pixelmatch": "bin/pixelmatch"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/pkg-dir": {
|
"node_modules/pkg-dir": {
|
||||||
"version": "5.0.0",
|
"version": "5.0.0",
|
||||||
"resolved": "https://registry.npmjs.org/pkg-dir/-/pkg-dir-5.0.0.tgz",
|
"resolved": "https://registry.npmjs.org/pkg-dir/-/pkg-dir-5.0.0.tgz",
|
||||||
@@ -9925,6 +9955,15 @@
|
|||||||
"resolved": "https://registry.npmjs.org/png-js/-/png-js-1.0.0.tgz",
|
"resolved": "https://registry.npmjs.org/png-js/-/png-js-1.0.0.tgz",
|
||||||
"integrity": "sha512-k+YsbhpA9e+EFfKjTCH3VW6aoKlyNYI6NYdTfDL4CIvFnvsuO84ttonmZE7rc+v23SLTH8XX+5w/Ak9v0xGY4g=="
|
"integrity": "sha512-k+YsbhpA9e+EFfKjTCH3VW6aoKlyNYI6NYdTfDL4CIvFnvsuO84ttonmZE7rc+v23SLTH8XX+5w/Ak9v0xGY4g=="
|
||||||
},
|
},
|
||||||
|
"node_modules/pngjs": {
|
||||||
|
"version": "7.0.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/pngjs/-/pngjs-7.0.0.tgz",
|
||||||
|
"integrity": "sha512-LKWqWJRhstyYo9pGvgor/ivk2w94eSjE3RGVuzLGlr3NmD8bf7RcYGze1mNdEHRP6TRP6rMuDHk5t44hnTRyow==",
|
||||||
|
"license": "MIT",
|
||||||
|
"engines": {
|
||||||
|
"node": ">=14.19.0"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/points-on-curve": {
|
"node_modules/points-on-curve": {
|
||||||
"version": "0.2.0",
|
"version": "0.2.0",
|
||||||
"resolved": "https://registry.npmjs.org/points-on-curve/-/points-on-curve-0.2.0.tgz",
|
"resolved": "https://registry.npmjs.org/points-on-curve/-/points-on-curve-0.2.0.tgz",
|
||||||
|
|||||||
@@ -86,6 +86,7 @@
|
|||||||
"blob-stream": "^0.1.3",
|
"blob-stream": "^0.1.3",
|
||||||
"bwip-js": "^4.8.0",
|
"bwip-js": "^4.8.0",
|
||||||
"cropperjs": "^1.6.2",
|
"cropperjs": "^1.6.2",
|
||||||
|
"diff": "^8.0.3",
|
||||||
"embedpdf-snippet": "file:vendor/embedpdf/embedpdf-snippet-2.3.0.tgz",
|
"embedpdf-snippet": "file:vendor/embedpdf/embedpdf-snippet-2.3.0.tgz",
|
||||||
"heic2any": "^0.0.4",
|
"heic2any": "^0.0.4",
|
||||||
"highlight.js": "^11.11.1",
|
"highlight.js": "^11.11.1",
|
||||||
@@ -111,11 +112,13 @@
|
|||||||
"markdown-it-task-lists": "^2.1.1",
|
"markdown-it-task-lists": "^2.1.1",
|
||||||
"markdown-it-toc-done-right": "^4.2.0",
|
"markdown-it-toc-done-right": "^4.2.0",
|
||||||
"mermaid": "^11.12.3",
|
"mermaid": "^11.12.3",
|
||||||
|
"microdiff": "^1.5.0",
|
||||||
"node-forge": "^1.3.3",
|
"node-forge": "^1.3.3",
|
||||||
"papaparse": "^5.5.3",
|
"papaparse": "^5.5.3",
|
||||||
"pdf-lib": "^1.17.1",
|
"pdf-lib": "^1.17.1",
|
||||||
"pdfjs-dist": "^5.4.624",
|
"pdfjs-dist": "^5.4.624",
|
||||||
"pdfkit": "^0.17.2",
|
"pdfkit": "^0.17.2",
|
||||||
|
"pixelmatch": "^7.1.0",
|
||||||
"postal-mime": "^2.7.3",
|
"postal-mime": "^2.7.3",
|
||||||
"rete": "^2.0.6",
|
"rete": "^2.0.6",
|
||||||
"rete-area-plugin": "^2.1.5",
|
"rete-area-plugin": "^2.1.5",
|
||||||
|
|||||||
@@ -238,19 +238,6 @@ input[type='file']::file-selector-button {
|
|||||||
position: relative;
|
position: relative;
|
||||||
width: 100%;
|
width: 100%;
|
||||||
height: 75vh;
|
height: 75vh;
|
||||||
overflow: auto;
|
|
||||||
border: 2px solid #374151;
|
|
||||||
border-radius: 0.5rem;
|
|
||||||
background-color: #1f2937;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* This rule now ONLY applies to canvases in overlay mode */
|
|
||||||
.compare-viewer-wrapper.overlay-mode canvas {
|
|
||||||
position: absolute;
|
|
||||||
top: 0;
|
|
||||||
left: 0;
|
|
||||||
width: 100%;
|
|
||||||
height: auto;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
.compare-viewer-wrapper.side-by-side-mode {
|
.compare-viewer-wrapper.side-by-side-mode {
|
||||||
|
|||||||
78
src/js/compare/engine/compare-page-models.ts
Normal file
78
src/js/compare/engine/compare-page-models.ts
Normal file
@@ -0,0 +1,78 @@
|
|||||||
|
import type { ComparePageModel, ComparePageResult } from '../types.ts';
|
||||||
|
import { diffTextRuns } from './diff-text-runs.ts';
|
||||||
|
|
||||||
|
/**
 * Produces the comparison result for one aligned pair of pages.
 *
 * - Both pages present: the text items are diffed with `diffTextRuns`.
 * - Only one page present: a single page-level change is reported, carrying
 *   the first 200 characters of that page's plain text as a preview.
 * - Neither page present: an empty `match` result.
 *
 * @param leftPage  Page model from the first PDF, or `null` when absent.
 * @param rightPage Page model from the second PDF, or `null` when absent.
 * @returns The per-page result; `visualDiff` is always `null` here.
 */
export function comparePageModels(
  leftPage: ComparePageModel | null,
  rightPage: ComparePageModel | null
): ComparePageResult {
  // Preview length used for page-level (added/removed) changes.
  const excerpt = (page: ComparePageModel) => page.plainText.slice(0, 200);

  if (leftPage && rightPage) {
    const { changes, summary } = diffTextRuns(
      leftPage.textItems,
      rightPage.textItems
    );

    return {
      status: changes.length > 0 ? 'changed' : 'match',
      leftPageNumber: leftPage.pageNumber,
      rightPageNumber: rightPage.pageNumber,
      changes,
      summary,
      visualDiff: null,
      usedOcr: leftPage.source === 'ocr' || rightPage.source === 'ocr',
    };
  }

  if (leftPage) {
    return {
      status: 'left-only',
      leftPageNumber: leftPage.pageNumber,
      rightPageNumber: null,
      changes: [
        {
          id: 'page-removed',
          type: 'page-removed',
          description: `Page ${leftPage.pageNumber} exists only in the first PDF.`,
          beforeText: excerpt(leftPage),
          afterText: '',
          beforeRects: [],
          afterRects: [],
        },
      ],
      summary: { added: 0, removed: 1, modified: 0 },
      visualDiff: null,
      usedOcr: leftPage.source === 'ocr',
    };
  }

  if (rightPage) {
    return {
      status: 'right-only',
      leftPageNumber: null,
      rightPageNumber: rightPage.pageNumber,
      changes: [
        {
          id: 'page-added',
          type: 'page-added',
          description: `Page ${rightPage.pageNumber} exists only in the second PDF.`,
          beforeText: '',
          afterText: excerpt(rightPage),
          beforeRects: [],
          afterRects: [],
        },
      ],
      summary: { added: 1, removed: 0, modified: 0 },
      visualDiff: null,
      usedOcr: rightPage.source === 'ocr',
    };
  }

  // Neither side supplied a page: nothing to compare.
  return {
    status: 'match',
    leftPageNumber: null,
    rightPageNumber: null,
    changes: [],
    summary: { added: 0, removed: 0, modified: 0 },
    visualDiff: null,
    usedOcr: false,
  };
}
|
||||||
237
src/js/compare/engine/diff-text-runs.ts
Normal file
237
src/js/compare/engine/diff-text-runs.ts
Normal file
@@ -0,0 +1,237 @@
|
|||||||
|
import { diffArrays } from 'diff';
|
||||||
|
|
||||||
|
import type {
|
||||||
|
CharPosition,
|
||||||
|
CompareChangeSummary,
|
||||||
|
CompareRectangle,
|
||||||
|
CompareTextChange,
|
||||||
|
CompareTextItem,
|
||||||
|
CompareWordToken,
|
||||||
|
} from '../types.ts';
|
||||||
|
|
||||||
|
/** A single word used as the unit of comparison by the text diff. */
interface WordToken {
  /** Word text as reported in change descriptions. */
  word: string;
  /** Key actually fed to the diff (lower-cased when derived in this file). */
  compareWord: string;
  /** Box covering the word; used to build highlight rectangles. */
  rect: CompareRectangle;
}
|
||||||
|
|
||||||
|
/**
 * Returns one horizontal position per character of `line.normalizedText`.
 *
 * The line's own `charMap` is reused when its length matches the text;
 * otherwise the line rectangle's width is split evenly across the characters.
 */
function getCharMap(line: CompareTextItem): CharPosition[] {
  const { charMap, normalizedText, rect } = line;
  const length = normalizedText.length;

  if (charMap && charMap.length === length) {
    return charMap;
  }

  // Guard the divisor so an empty line does not divide by zero.
  const step = rect.width / Math.max(length, 1);
  const positions: CharPosition[] = [];
  for (let i = 0; i < length; i += 1) {
    positions.push({ x: rect.x + i * step, width: step });
  }
  return positions;
}
|
||||||
|
|
||||||
|
/**
 * Breaks a text line into word tokens with a rectangle for each word.
 *
 * Tokens already attached to the line (`wordTokens`) are preferred and copied
 * down to the three fields this module needs. Otherwise the normalized text is
 * split on whitespace and each word's box is derived from the character map,
 * falling back to an even split of the line width when a position is missing.
 */
function splitLineIntoWords(line: CompareTextItem): WordToken[] {
  const precomputed = line.wordTokens;
  if (precomputed && precomputed.length > 0) {
    return precomputed.map(
      ({ word, compareWord, rect }: CompareWordToken) => ({
        word,
        compareWord,
        rect,
      })
    );
  }

  const text = line.normalizedText;
  const pieces = text.split(/\s+/).filter(Boolean);
  if (pieces.length === 0) return [];

  const positions = getCharMap(line);
  const tokens: WordToken[] = [];
  // Search cursor: keeps repeated words mapped to successive occurrences.
  let cursor = 0;

  for (const word of pieces) {
    const first = text.indexOf(word, cursor);
    const last = first + word.length - 1;
    cursor = first + word.length;

    const head = positions[first];
    const tail = positions[last];

    let rect: CompareRectangle;
    if (head && tail) {
      rect = {
        x: head.x,
        y: line.rect.y,
        width: tail.x + tail.width - head.x,
        height: line.rect.height,
      };
    } else {
      // No usable character positions: assume evenly spaced characters.
      const step = line.rect.width / Math.max(text.length, 1);
      rect = {
        x: line.rect.x + first * step,
        y: line.rect.y,
        width: word.length * step,
        height: line.rect.height,
      };
    }

    tokens.push({ word, compareWord: word.toLowerCase(), rect });
  }

  return tokens;
}
|
||||||
|
|
||||||
|
/**
 * Merges rectangles that sit next to each other on the same line into one
 * bounding rectangle per run.
 *
 * Rectangles are ordered by `y` then `x`. A rectangle extends the current run
 * when, relative to the previous rectangle in that run, its `y` differs by
 * less than `max(0.6 * height, 4)` and its `x` starts no further right than
 * the previous right edge plus twice the previous height.
 *
 * @param rects Rectangles to merge; the input array is not modified.
 * @returns One bounding rectangle per run, in sorted order.
 */
function groupAdjacentRects(rects: CompareRectangle[]): CompareRectangle[] {
  if (rects.length === 0) return [];

  const ordered = rects.slice().sort((a, b) => a.y - b.y || a.x - b.x);

  type Bounds = { minX: number; minY: number; maxX: number; maxY: number };
  const runs: Bounds[] = [];
  // Adjacency is judged against the last rectangle added, not the run's box.
  let tail: CompareRectangle | null = null;

  for (const rect of ordered) {
    const continuesRun =
      tail !== null &&
      Math.abs(rect.y - tail.y) < Math.max(tail.height * 0.6, 4) &&
      rect.x <= tail.x + tail.width + tail.height * 2;

    if (continuesRun) {
      const box = runs[runs.length - 1];
      box.minX = Math.min(box.minX, rect.x);
      box.minY = Math.min(box.minY, rect.y);
      box.maxX = Math.max(box.maxX, rect.x + rect.width);
      box.maxY = Math.max(box.maxY, rect.y + rect.height);
    } else {
      runs.push({
        minX: rect.x,
        minY: rect.y,
        maxX: rect.x + rect.width,
        maxY: rect.y + rect.height,
      });
    }
    tail = rect;
  }

  return runs.map(({ minX, minY, maxX, maxY }) => ({
    x: minX,
    y: minY,
    width: maxX - minX,
    height: maxY - minY,
  }));
}
|
||||||
|
|
||||||
|
/** Concatenates the comparison keys of `words` with no separator. */
function collapseWords(words: WordToken[]) {
  const keys = Array.from(words, (token) => token.compareWord);
  return keys.join('');
}
|
||||||
|
|
||||||
|
/**
 * Tells whether two non-empty word runs spell the same text once the word
 * boundaries are removed (e.g. "foo bar" vs "foobar").
 *
 * Returns `false` when either run is empty.
 */
function areEquivalentIgnoringWordBreaks(
  beforeWords: WordToken[],
  afterWords: WordToken[]
) {
  const bothPopulated = beforeWords.length > 0 && afterWords.length > 0;
  return (
    bothPopulated && collapseWords(beforeWords) === collapseWords(afterWords)
  );
}
|
||||||
|
|
||||||
|
/**
 * Appends one change record to `changes` for the given word runs.
 *
 * Nothing is appended when both runs produce empty text. The record id is
 * `<type>-<current length of changes>`. For `removed` only the "before" side
 * is kept; for `modified` both sides are kept; any other type is recorded as
 * an addition with only the "after" side.
 *
 * @param changes     Accumulator that receives the new record (mutated).
 * @param type        Kind of change to record.
 * @param beforeWords Words from the first document.
 * @param afterWords  Words from the second document.
 */
function createWordChange(
  changes: CompareTextChange[],
  type: CompareTextChange['type'],
  beforeWords: WordToken[],
  afterWords: WordToken[]
) {
  const joinWords = (tokens: WordToken[]) =>
    tokens.map((token) => token.word).join(' ');

  const beforeText = joinWords(beforeWords);
  const afterText = joinWords(afterWords);
  if (!beforeText && !afterText) return;

  const id = `${type}-${changes.length}`;
  const beforeRects = groupAdjacentRects(
    beforeWords.map((token) => token.rect)
  );
  const afterRects = groupAdjacentRects(afterWords.map((token) => token.rect));

  switch (type) {
    case 'modified':
      changes.push({
        id,
        type,
        description: `Replaced "${beforeText}" with "${afterText}"`,
        beforeText,
        afterText,
        beforeRects,
        afterRects,
      });
      break;
    case 'removed':
      changes.push({
        id,
        type,
        description: `Removed "${beforeText}"`,
        beforeText,
        afterText: '',
        beforeRects,
        afterRects: [],
      });
      break;
    default:
      changes.push({
        id,
        type,
        description: `Added "${afterText}"`,
        beforeText: '',
        afterText,
        beforeRects: [],
        afterRects,
      });
      break;
  }
}
|
||||||
|
|
||||||
|
/**
 * Counts how many changes are of type `added`, `removed` and `modified`.
 * Changes of any other type are ignored.
 */
function toSummary(changes: CompareTextChange[]): CompareChangeSummary {
  const summary: CompareChangeSummary = { added: 0, removed: 0, modified: 0 };

  for (const { type } of changes) {
    switch (type) {
      case 'added':
        summary.added += 1;
        break;
      case 'removed':
        summary.removed += 1;
        break;
      case 'modified':
        summary.modified += 1;
        break;
      default:
        break;
    }
  }

  return summary;
}
|
||||||
|
|
||||||
|
/**
 * Word-level diff between two lists of text items.
 *
 * Both lists are flattened into word tokens and their comparison keys are
 * diffed with `diffArrays`. A removed hunk immediately followed by an added
 * hunk is reported as one `modified` change, unless both sides spell the same
 * text once word breaks are ignored, in which case the pair is dropped.
 *
 * @param beforeItems Text items from the first document's page.
 * @param afterItems  Text items from the second document's page.
 * @returns The list of changes and their per-type counts.
 */
export function diffTextRuns(
  beforeItems: CompareTextItem[],
  afterItems: CompareTextItem[]
) {
  const beforeWords = beforeItems.flatMap((item) => splitLineIntoWords(item));
  const afterWords = afterItems.flatMap((item) => splitLineIntoWords(item));

  const hunks = diffArrays(
    beforeWords.map((token) => token.compareWord),
    afterWords.map((token) => token.compareWord)
  );

  const changes: CompareTextChange[] = [];
  // Read positions into the two token lists, advanced hunk by hunk.
  let beforeCursor = 0;
  let afterCursor = 0;
  let hunkIndex = 0;

  while (hunkIndex < hunks.length) {
    const hunk = hunks[hunkIndex];
    const size = hunk.value.length;
    hunkIndex += 1;

    if (hunk.removed) {
      const removedTokens = beforeWords.slice(
        beforeCursor,
        beforeCursor + size
      );
      beforeCursor += size;

      const following = hunks[hunkIndex];
      if (!following?.added) {
        createWordChange(changes, 'removed', removedTokens, []);
        continue;
      }

      // Consume the paired addition together with the removal.
      const addedSize = following.value.length;
      const addedTokens = afterWords.slice(afterCursor, afterCursor + addedSize);
      afterCursor += addedSize;
      hunkIndex += 1;

      if (!areEquivalentIgnoringWordBreaks(removedTokens, addedTokens)) {
        createWordChange(changes, 'modified', removedTokens, addedTokens);
      }
      continue;
    }

    if (hunk.added) {
      const addedTokens = afterWords.slice(afterCursor, afterCursor + size);
      afterCursor += size;
      createWordChange(changes, 'added', [], addedTokens);
      continue;
    }

    // Unchanged hunk: both sides advance together.
    beforeCursor += size;
    afterCursor += size;
  }

  return { changes, summary: toSummary(changes) };
}
|
||||||
520
src/js/compare/engine/extract-page-model.ts
Normal file
520
src/js/compare/engine/extract-page-model.ts
Normal file
@@ -0,0 +1,520 @@
|
|||||||
|
import * as pdfjsLib from 'pdfjs-dist';
|
||||||
|
|
||||||
|
import type {
|
||||||
|
ComparePageModel,
|
||||||
|
CompareTextItem,
|
||||||
|
CharPosition,
|
||||||
|
CompareWordToken,
|
||||||
|
} from '../types.ts';
|
||||||
|
import {
|
||||||
|
joinCompareTextItems,
|
||||||
|
normalizeCompareText,
|
||||||
|
} from './text-normalization.ts';
|
||||||
|
|
||||||
|
/**
 * Fields of a page text item that this module reads.
 * NOTE(review): presumably mirrors the pdf.js text-content item shape —
 * confirm against the pdfjs-dist typings.
 */
type PageTextItem = {
  /** Raw text of the item. */
  str: string;
  width: number;
  height: number;
  /** Transform array; indices 0, 1, 4 and 5 are read in this module. */
  transform: number[];
  dir: string;
  /** Key into the `TextStyles` map. */
  fontName: string;
  hasEOL: boolean;
};

/** Style entries keyed by font name; only `fontFamily` is consulted. */
type TextStyles = Record<string, { fontFamily?: string }>;
|
||||||
|
|
||||||
|
// Off-screen canvas used only to measure text. It is created solely when a
// DOM `document` exists, so the module can still be loaded without one.
const measurementCanvas =
  typeof document === 'undefined' ? null : document.createElement('canvas');

// 2D context of that canvas, or null when no canvas/context is available.
const measurementContext = measurementCanvas?.getContext('2d') ?? null;

// Measured widths keyed by "<font>|<text>"; only allocated when measuring is
// actually possible.
const textMeasurementCache: Map<string, number> | null =
  measurementContext === null ? null : new Map<string, number>();

// Font most recently assigned to the context, to skip redundant assignments.
let lastMeasurementFont = '';

// Nominal widths used when real measurement is unavailable or unusable.
const DEFAULT_CHAR_WIDTH = 1;
const DEFAULT_SPACE_WIDTH = 0.33;
|
||||||
|
|
||||||
|
/**
 * Decides whether `current` attaches directly to `previous` with no space.
 *
 * True when `current` starts with closing punctuation or a quote character,
 * or when `previous` ends with an opening bracket, slash, quote or hyphen.
 * Always false when `previous` is empty.
 */
function shouldJoinTokenWithPrevious(previous: string, current: string) {
  if (!previous) return false;

  const startsWithClosingPunctuation = /^[,.;:!?%)\]}]/.test(current);
  const startsWithQuote = /^[''"'’”]/u.test(current);
  const previousEndsOpen = /[([{/"'“‘-]$/u.test(previous);

  return startsWithClosingPunctuation || startsWithQuote || previousEndsOpen;
}
|
||||||
|
|
||||||
|
/**
 * Returns the width of `text` rendered with the canvas font `fontSpec`.
 *
 * Without a measurement context the width is estimated: 0 for empty text,
 * `DEFAULT_SPACE_WIDTH` for a single space, otherwise
 * `DEFAULT_CHAR_WIDTH` per character. With a context, results are cached per
 * font/text pair.
 */
function measureTextWidth(fontSpec: string, text: string): number {
  if (!measurementContext) {
    if (!text) {
      return 0;
    }
    return text === ' '
      ? DEFAULT_SPACE_WIDTH
      : text.length * DEFAULT_CHAR_WIDTH;
  }

  // Only touch the context's font when it actually changes.
  if (fontSpec !== lastMeasurementFont) {
    measurementContext.font = fontSpec;
    lastMeasurementFont = fontSpec;
  }

  const cacheKey = `${fontSpec}|${text}`;
  const known = textMeasurementCache?.get(cacheKey);
  if (known !== undefined) {
    return known;
  }

  const measured = measurementContext.measureText(text).width || 0;
  textMeasurementCache?.set(cacheKey, measured);
  return measured;
}
|
||||||
|
|
||||||
|
/**
 * Splits one page text item into word tokens, each with its own rectangle in
 * viewport coordinates.
 *
 * Per-character widths are obtained by measuring growing prefixes of the text
 * with `measureTextWidth`; they are used only as relative weights to place
 * each word along the item. The item's corners are converted with
 * `viewport.convertToViewportPoint`, and each word's box is the bounding box
 * of its slice of the (possibly rotated) item quad.
 *
 * @param viewport     Viewport used to convert the item corners.
 * @param item         Text item to split.
 * @param fallbackRect Rectangle supplying fallback x/y and the minimum height.
 * @param styles       Style map used to look up the font family.
 * @returns Word tokens in text order; empty when the item is blank.
 */
function buildItemWordTokens(
  viewport: pdfjsLib.PageViewport,
  item: PageTextItem,
  fallbackRect: CompareTextItem['rect'],
  styles: TextStyles
): CompareWordToken[] {
  const source = item.str || '';
  if (source.trim() === '') {
    return [];
  }

  const charCount = Math.max(source.length, 1);
  const style = item.fontName ? styles[item.fontName] : undefined;
  const fontSize = Math.max(
    0.5,
    Math.hypot(item.transform[0], item.transform[1]) || 0
  );
  const fontSpec = `${fontSize}px ${style?.fontFamily ?? 'sans-serif'}`;

  const nominalWidth = (char: string) =>
    char === ' ' ? DEFAULT_SPACE_WIDTH : DEFAULT_CHAR_WIDTH;

  // Width of each character = growth of the measured prefix it completes.
  const weights: number[] = [];
  let measuredSoFar = '';
  let lastAdvance = 0;
  for (let i = 0; i < charCount; i += 1) {
    measuredSoFar += source[i];
    const advance = measureTextWidth(fontSpec, measuredSoFar);
    const delta = advance - lastAdvance;
    weights.push(
      Number.isFinite(delta) && delta > 0 ? delta : nominalWidth(source[i])
    );
    lastAdvance = advance;
  }

  // Measurement produced nothing usable overall: use nominal widths throughout.
  if (!(Number.isFinite(lastAdvance) && lastAdvance > 0)) {
    for (let i = 0; i < charCount; i += 1) {
      weights[i] = nominalWidth(source[i]);
    }
  }

  // cumulative[k] = total weight of the first k characters.
  const cumulative: number[] = [0];
  for (let i = 0; i < charCount; i += 1) {
    cumulative.push(cumulative[i] + weights[i]);
  }
  const totalWeight = cumulative[charCount] || 1;

  const originX = item.transform[4];
  const originY = item.transform[5];
  const farX = originX + item.width;
  const farY = originY + item.height;
  const corners = [
    viewport.convertToViewportPoint(originX, originY),
    viewport.convertToViewportPoint(farX, originY),
    viewport.convertToViewportPoint(originX, farY),
    viewport.convertToViewportPoint(farX, farY),
  ];
  const cornerXs = corners.map((point) => point[0]);
  const cornerYs = corners.map((point) => point[1]);
  const itemLeft = Math.min(...cornerXs);
  const itemRight = Math.max(...cornerXs);
  const itemTop = Math.min(...cornerYs);
  const itemBottom = Math.max(...cornerYs);

  // Direction along the text and direction across it, in viewport space.
  const [origin, baselineTip, verticalTip] = corners;
  const alongX = baselineTip[0] - origin[0];
  const alongY = baselineTip[1] - origin[1];
  const acrossX = verticalTip[0] - origin[0];
  const acrossY = verticalTip[1] - origin[1];
  const oriented =
    Math.hypot(alongX, alongY) > 1e-6 && Math.hypot(acrossX, acrossY) > 1e-6;

  // Bounding box of the item slice between two relative positions (0..1).
  const boundsForSpan = (relStart: number, relEnd: number) => {
    if (!oriented) {
      const spanA = itemLeft + (itemRight - itemLeft) * relStart;
      const spanB = itemLeft + (itemRight - itemLeft) * relEnd;
      return {
        minX: Math.min(spanA, spanB),
        maxX: Math.max(spanA, spanB),
        minY: itemTop,
        maxY: itemBottom,
      };
    }

    const startX = origin[0] + alongX * relStart;
    const startY = origin[1] + alongY * relStart;
    const endX = origin[0] + alongX * relEnd;
    const endY = origin[1] + alongY * relEnd;
    const quadXs = [startX, startX + acrossX, endX + acrossX, endX];
    const quadYs = [startY, startY + acrossY, endY + acrossY, endY];
    return {
      minX: Math.min(...quadXs),
      maxX: Math.max(...quadXs),
      minY: Math.min(...quadYs),
      maxY: Math.max(...quadYs),
    };
  };

  const tokens: CompareWordToken[] = [];
  const wordPattern = /\S+/gu;
  // End offset of the last word seen, including words that were dropped.
  let consumedUpTo = 0;

  for (
    let found = wordPattern.exec(source);
    found !== null;
    found = wordPattern.exec(source)
  ) {
    const rawWord = found[0];
    const begin = found.index;
    const end = begin + rawWord.length;

    const word = normalizeCompareText(rawWord);
    if (!word) {
      consumedUpTo = end;
      continue;
    }

    const { minX, maxX, minY, maxY } = boundsForSpan(
      cumulative[begin] / totalWeight,
      cumulative[end] / totalWeight
    );

    const gapText = source.slice(consumedUpTo, begin);
    const precedingToken = tokens[tokens.length - 1];
    const separatedByNonSpace = gapText.length > 0 && !/\s/u.test(gapText);
    const punctuationJoins = precedingToken
      ? shouldJoinTokenWithPrevious(precedingToken.word, word)
      : false;

    tokens.push({
      word,
      compareWord: word.toLowerCase(),
      rect: {
        x: Number.isFinite(minX) ? minX : fallbackRect.x,
        y: Number.isFinite(minY) ? minY : fallbackRect.y,
        width: Math.max(maxX - minX, 1),
        height: Math.max(maxY - minY, fallbackRect.height),
      },
      joinsWithPrevious: separatedByNonSpace || punctuationJoins,
    });

    consumedUpTo = end;
  }

  return tokens;
}
|
||||||
|
|
||||||
|
/**
 * Converts a page text item into a `CompareTextItem`.
 *
 * The item's transform is combined with the viewport transform via
 * `pdfjsLib.Util.transform`. Width and height are clamped to at least 1, and
 * `y` is the transformed y offset minus the height.
 *
 * @param viewport Viewport supplying the transform and scale.
 * @param item     Source text item.
 * @param index    Position of the item, used as the id prefix.
 * @param styles   Style map forwarded to the word tokenizer.
 */
function toRect(
  viewport: pdfjsLib.PageViewport,
  item: PageTextItem,
  index: number,
  styles: TextStyles
) {
  const normalizedText = normalizeCompareText(item.str);

  const matrix = pdfjsLib.Util.transform(viewport.transform, item.transform);
  const { scale } = viewport;

  const width = Math.max(item.width * scale, 1);
  // Prefer the transformed vertical scale; fall back to the scaled item height.
  const glyphHeight = Math.abs(matrix[3]) || item.height * scale;
  const height = Math.max(glyphHeight, 1);

  const rect = {
    x: matrix[4],
    y: matrix[5] - height,
    width,
    height,
  };

  return {
    id: `${index}-${normalizedText}`,
    text: item.str,
    normalizedText,
    rect,
    wordTokens: buildItemWordTokens(viewport, item, rect, styles),
  } satisfies CompareTextItem;
}
|
||||||
|
|
||||||
|
/**
 * Returns a copy of `items` sorted top-to-bottom, then left-to-right.
 *
 * Two items count as being on the same line when their `y` values differ by
 * no more than `max(0.6 * smaller height, 4)`. On the same line, items more
 * than 1 unit apart in `x` are ordered by `x`; otherwise by `id`.
 */
export function sortCompareTextItems(items: CompareTextItem[]) {
  const byReadingOrder = (a: CompareTextItem, b: CompareTextItem) => {
    const tolerance = Math.max(
      Math.min(a.rect.height, b.rect.height) * 0.6,
      4
    );

    const dy = a.rect.y - b.rect.y;
    if (Math.abs(dy) > tolerance) {
      return dy;
    }

    const dx = a.rect.x - b.rect.x;
    return Math.abs(dx) > 1 ? dx : a.id.localeCompare(b.id);
  };

  return items.slice().sort(byReadingOrder);
}
|
||||||
|
|
||||||
|
/**
 * Average width per non-whitespace character of the item's normalized text.
 * An item with no such characters yields its full rectangle width.
 */
function averageCharacterWidth(item: CompareTextItem) {
  const visibleCharCount = item.normalizedText.replace(/\s+/g, '').length;
  return item.rect.width / Math.max(visibleCharCount, 1);
}
|
||||||
|
|
||||||
|
/**
 * Decides whether a space belongs between two neighbouring items on a line.
 *
 * No space is inserted when either text is empty, when punctuation/quote rules
 * attach the right item to the left one, or when the items touch or overlap.
 * Otherwise a space is inserted if the horizontal gap is at least
 * `max(0.45 * narrower average character width, 1.5)`.
 */
function shouldInsertSpaceBetweenItems(
  left: CompareTextItem,
  right: CompareTextItem
) {
  const before = left.normalizedText;
  const after = right.normalizedText;
  if (!before || !after) {
    return false;
  }

  const attachesWithoutSpace =
    /^[,.;:!?%)\]}]/.test(after) ||
    /^[''"'’”]/u.test(after) ||
    /[([{/"'“‘-]$/u.test(before);
  if (attachesWithoutSpace) {
    return false;
  }

  const gap = right.rect.x - (left.rect.x + left.rect.width);
  if (gap <= 0) {
    return false;
  }

  const narrowerCharWidth = Math.min(
    averageCharacterWidth(left),
    averageCharacterWidth(right)
  );
  return gap >= Math.max(narrowerCharWidth * 0.45, 1.5);
}
|
||||||
|
|
||||||
|
/**
 * Joins the items of one line into a single string and builds a position
 * entry for every character appended along the way.
 *
 * Each item's characters are spread evenly across its rectangle. Where
 * `shouldInsertSpaceBetweenItems` asks for a space, one space is added whose
 * position entry starts at the previous item's right edge and spans the gap
 * (at least 1 wide). The joined string is passed through
 * `normalizeCompareText` before being returned.
 */
function mergeLineText(lineItems: CompareTextItem[]): {
  text: string;
  charMap: CharPosition[];
} {
  const charMap: CharPosition[] = [];
  if (lineItems.length === 0) {
    return { text: '', charMap };
  }

  // Appends evenly spaced positions for every character of one item.
  const appendItemChars = ({ normalizedText, rect }: CompareTextItem) => {
    const step = rect.width / Math.max(normalizedText.length, 1);
    for (let offset = 0; offset < normalizedText.length; offset += 1) {
      charMap.push({ x: rect.x + offset * step, width: step });
    }
  };

  let merged = '';
  for (let index = 0; index < lineItems.length; index += 1) {
    const current = lineItems[index];
    const previous = index > 0 ? lineItems[index - 1] : null;

    if (previous && shouldInsertSpaceBetweenItems(previous, current)) {
      const gap = current.rect.x - (previous.rect.x + previous.rect.width);
      charMap.push({
        x: previous.rect.x + previous.rect.width,
        width: Math.max(gap, 1),
      });
      merged += ' ';
    }

    merged += current.normalizedText;
    appendItemChars(current);
  }

  return { text: normalizeCompareText(merged), charMap };
}
|
||||||
|
|
||||||
|
/**
 * Fuses two word tokens into one.
 *
 * The words and compare-words are concatenated (left first) and the rectangle
 * becomes the bounding box of both input rectangles. The result does not carry
 * a `joinsWithPrevious` flag.
 *
 * @param left Token whose text comes first.
 * @param right Token whose text is appended.
 * @returns A new token covering both inputs.
 */
function mergeWordTokenRects(
  left: CompareWordToken,
  right: CompareWordToken
): CompareWordToken {
  const a = left.rect;
  const b = right.rect;

  const x = Math.min(a.x, b.x);
  const y = Math.min(a.y, b.y);
  const rightEdge = Math.max(a.x + a.width, b.x + b.width);
  const bottomEdge = Math.max(a.y + a.height, b.y + b.height);

  return {
    word: `${left.word}${right.word}`,
    compareWord: `${left.compareWord}${right.compareWord}`,
    rect: { x, y, width: rightEdge - x, height: bottomEdge - y },
  };
}
|
||||||
|
|
||||||
|
/**
 * Builds the word-token list for a merged line from the tokens of its items.
 *
 * Items without word tokens are represented by a single token made from their
 * whole normalized text. Tokens are fused with the previously emitted token
 * when:
 * - they are not the first token of their item and are flagged
 *   `joinsWithPrevious`, or
 * - they are the first token of an item and no space belongs between the
 *   previous item and this one (per `shouldInsertSpaceBetweenItems`).
 *
 * @param lineItems Items of a single line, in reading order.
 * @returns The merged tokens, or `undefined` when no item has word tokens.
 */
function buildMergedWordTokens(lineItems: CompareTextItem[]) {
  const anyItemHasTokens = lineItems.some(
    (item) => item.wordTokens && item.wordTokens.length > 0
  );
  if (!anyItemHasTokens) {
    return undefined;
  }

  const mergedTokens: CompareWordToken[] = [];
  let previousItem: CompareTextItem | null = null;

  for (const item of lineItems) {
    let itemTokens = item.wordTokens;
    if (!itemTokens || itemTokens.length === 0) {
      // Fall back to treating the whole item as one word.
      const wholeItemToken: CompareWordToken = {
        word: item.normalizedText,
        compareWord: item.normalizedText.toLowerCase(),
        rect: item.rect,
      };
      itemTokens = [wholeItemToken];
    }

    for (const [tokenIndex, token] of itemTokens.entries()) {
      let shouldJoin = false;
      if (mergedTokens.length > 0) {
        if (tokenIndex > 0) {
          shouldJoin = Boolean(token.joinsWithPrevious);
        } else if (previousItem) {
          shouldJoin = !shouldInsertSpaceBetweenItems(previousItem, item);
        }
      }

      if (shouldJoin) {
        const lastIndex = mergedTokens.length - 1;
        mergedTokens[lastIndex] = mergeWordTokenRects(
          mergedTokens[lastIndex],
          token
        );
      } else {
        mergedTokens.push({
          word: token.word,
          compareWord: token.compareWord,
          rect: token.rect,
        });
      }
    }

    previousItem = item;
  }

  return mergedTokens;
}
|
||||||
|
|
||||||
|
/**
 * Groups already-sorted text items into lines and merges each line into a
 * single `CompareTextItem`.
 *
 * An item joins the current line when its `rect.y` is within a tolerance of
 * the line's first item; the tolerance is 60 % of the smaller of the two
 * heights, but never less than 4. Otherwise it starts a new line.
 *
 * Each resulting line item has id `line-<index>`, the raw texts joined with
 * spaces, the merged normalized text, the bounding box of its members, the
 * original members as `fragments`, a `charMap`, and merged `wordTokens`.
 *
 * @param sortedItems Items in reading order (as produced by
 *   `sortCompareTextItems`).
 * @returns One merged item per detected line; empty for empty input.
 */
export function mergeIntoLines(
  sortedItems: CompareTextItem[]
): CompareTextItem[] {
  if (sortedItems.length === 0) {
    return [];
  }

  const [firstItem, ...remainingItems] = sortedItems;
  const lines: CompareTextItem[][] = [[firstItem]];

  for (const candidate of remainingItems) {
    const openLine = lines[lines.length - 1];
    const anchor = openLine[0];
    const lineTolerance = Math.max(
      Math.min(anchor.rect.height, candidate.rect.height) * 0.6,
      4
    );

    const verticalDistance = Math.abs(candidate.rect.y - anchor.rect.y);
    if (verticalDistance <= lineTolerance) {
      openLine.push(candidate);
    } else {
      lines.push([candidate]);
    }
  }

  return lines.map((lineItems, lineIndex) => {
    const { text: normalizedText, charMap } = mergeLineText(lineItems);

    // Bounding box of every member of the line.
    let minX = Infinity;
    let minY = Infinity;
    let maxX = -Infinity;
    let maxY = -Infinity;
    for (const { rect } of lineItems) {
      minX = Math.min(minX, rect.x);
      minY = Math.min(minY, rect.y);
      maxX = Math.max(maxX, rect.x + rect.width);
      maxY = Math.max(maxY, rect.y + rect.height);
    }

    return {
      id: `line-${lineIndex}`,
      text: lineItems.map((item) => item.text).join(' '),
      normalizedText,
      rect: { x: minX, y: minY, width: maxX - minX, height: maxY - minY },
      fragments: lineItems,
      charMap,
      wordTokens: buildMergedWordTokens(lineItems),
    };
  });
}
|
||||||
|
|
||||||
|
/**
 * Builds the comparison model of one PDF page from its pdf.js text content.
 *
 * Text content is requested with `disableCombineTextItems: true`. Entries that
 * carry a `str` are converted with `toRect`, entries whose normalized text is
 * empty are dropped, and the rest are sorted and merged into lines.
 *
 * @param page pdf.js page proxy to read text from.
 * @param viewport Viewport passed to `toRect`; also supplies the model's
 *   width and height.
 * @returns The page model, with `source` set to `'pdfjs'`.
 */
export async function extractPageModel(
  page: pdfjsLib.PDFPageProxy,
  viewport: pdfjsLib.PageViewport
): Promise<ComparePageModel> {
  const textContent = await page.getTextContent({
    disableCombineTextItems: true,
  });
  const styleMap = textContent.styles ?? {};

  const positionedItems = textContent.items
    .filter((item): item is PageTextItem => 'str' in item)
    .map((item, index) => toRect(viewport, item, index, styleMap))
    .filter((item) => item.normalizedText.length > 0);

  const textItems = mergeIntoLines(sortCompareTextItems(positionedItems));
  const { width, height } = viewport;

  return {
    pageNumber: page.pageNumber,
    width,
    height,
    textItems,
    plainText: joinCompareTextItems(textItems),
    hasText: textItems.length > 0,
    source: 'pdfjs',
  };
}
|
||||||
76
src/js/compare/engine/ocr-page.ts
Normal file
76
src/js/compare/engine/ocr-page.ts
Normal file
@@ -0,0 +1,76 @@
|
|||||||
|
import Tesseract from 'tesseract.js';
|
||||||
|
|
||||||
|
import type { ComparePageModel, CompareTextItem } from '../types.ts';
|
||||||
|
import { mergeIntoLines, sortCompareTextItems } from './extract-page-model.ts';
|
||||||
|
import {
|
||||||
|
joinCompareTextItems,
|
||||||
|
normalizeCompareText,
|
||||||
|
} from './text-normalization.ts';
|
||||||
|
|
||||||
|
/**
 * Subset of a Tesseract word result that `recognizePageCanvas` reads:
 * the recognized text and its bounding box corners.
 */
type OcrWord = {
  /** Recognized text of the word. */
  text: string;
  /** Bounding box; `(x0, y0)` and `(x1, y1)` are opposite corners. */
  bbox: {
    x0: number;
    y0: number;
    x1: number;
    y1: number;
  };
};
|
||||||
|
|
||||||
|
/**
 * Runs Tesseract OCR on a rendered page canvas and converts the recognized
 * words into a comparison page model.
 *
 * Words whose normalized text is empty are skipped. Every remaining word
 * becomes a text item with a single word token; width and height are clamped
 * to at least 1. Items are then sorted and merged into lines.
 *
 * @param canvas Canvas holding the rendered page; its size becomes the model
 *   size.
 * @param language Language string handed to `Tesseract.recognize`.
 * @param onProgress Optional callback receiving each logger message's status
 *   and progress (0 when the message has none).
 * @returns The page model with `source` `'ocr'`; `pageNumber` is always 0.
 */
export async function recognizePageCanvas(
  canvas: HTMLCanvasElement,
  language: string,
  onProgress?: (status: string, progress: number) => void
): Promise<ComparePageModel> {
  const result = await Tesseract.recognize(canvas, language, {
    logger(message) {
      onProgress?.(message.status, message.progress || 0);
    },
  });

  const ocrData = result.data as unknown as { words?: OcrWord[] };
  const ocrWords: OcrWord[] = ocrData.words || [];

  // Returns a fresh rectangle each time so item and token never share one.
  const rectOf = ({ bbox }: OcrWord) => ({
    x: bbox.x0,
    y: bbox.y0,
    width: Math.max(bbox.x1 - bbox.x0, 1),
    height: Math.max(bbox.y1 - bbox.y0, 1),
  });

  const words: CompareTextItem[] = [];
  ocrWords.forEach((word, index) => {
    const normalizedText = normalizeCompareText(word.text || '');
    if (!normalizedText) {
      return;
    }

    words.push({
      id: `ocr-${index}-${normalizedText}`,
      text: word.text,
      normalizedText,
      rect: rectOf(word),
      wordTokens: [
        {
          word: normalizedText,
          compareWord: normalizedText.toLowerCase(),
          rect: rectOf(word),
        },
      ],
    });
  });

  const mergedItems = mergeIntoLines(sortCompareTextItems(words));

  return {
    pageNumber: 0,
    width: canvas.width,
    height: canvas.height,
    textItems: mergedItems,
    plainText: joinCompareTextItems(mergedItems),
    hasText: mergedItems.length > 0,
    source: 'ocr',
  };
}
|
||||||
61
src/js/compare/engine/page-signatures.ts
Normal file
61
src/js/compare/engine/page-signatures.ts
Normal file
@@ -0,0 +1,61 @@
|
|||||||
|
import * as pdfjsLib from 'pdfjs-dist';
|
||||||
|
|
||||||
|
import type { ComparePageSignature, CompareTextItem } from '../types.ts';
|
||||||
|
import {
|
||||||
|
joinNormalizedText,
|
||||||
|
normalizeCompareText,
|
||||||
|
} from './text-normalization.ts';
|
||||||
|
|
||||||
|
/**
 * Shape of a text-content entry that carries a string. Used as the target of
 * the `'str' in item` type guard in `extractPageSignature`, which only reads
 * `str`.
 */
type SignatureTextItem = {
  str: string;
  dir: string;
  transform: number[];
  width: number;
  height: number;
  fontName: string;
  hasEOL: boolean;
};
|
||||||
|
|
||||||
|
/**
 * Wraps a normalized token in a position-less `CompareTextItem`.
 *
 * @param token Normalized token text, used for both `text` and
 *   `normalizedText`.
 * @param index Position of the token, embedded in the id.
 * @returns An item with id `token-<index>-<token>` and an all-zero rectangle.
 */
function tokenToItem(token: string, index: number): CompareTextItem {
  const emptyRect = { x: 0, y: 0, width: 0, height: 0 };
  const id = `token-${index}-${token}`;

  return { id, text: token, normalizedText: token, rect: emptyRect };
}
|
||||||
|
|
||||||
|
/**
 * Extracts a lightweight text signature for one page.
 *
 * The page's text-content strings are normalized, empty results are dropped,
 * and only the first 500 remaining tokens are kept.
 *
 * @param pdfDoc Document to read the page from.
 * @param pageNumber Page number passed to `pdfDoc.getPage`.
 * @returns The signature: joined plain text, a has-text flag, and one
 *   position-less item per kept token.
 */
export async function extractPageSignature(
  pdfDoc: pdfjsLib.PDFDocumentProxy,
  pageNumber: number
): Promise<ComparePageSignature> {
  const page = await pdfDoc.getPage(pageNumber);
  const { items } = await page.getTextContent();
  const stringItems = items.filter(
    (item): item is SignatureTextItem => 'str' in item
  );

  const limitedTokens: string[] = [];
  for (const { str } of stringItems) {
    if (limitedTokens.length === 500) {
      break;
    }
    const token = normalizeCompareText(str);
    if (token) {
      limitedTokens.push(token);
    }
  }

  return {
    pageNumber,
    plainText: joinNormalizedText(limitedTokens),
    hasText: limitedTokens.length > 0,
    tokenItems: limitedTokens.map(tokenToItem),
  };
}
|
||||||
|
|
||||||
|
/**
 * Extracts the text signature of every page of a document, one page at a
 * time, in page order.
 *
 * @param pdfDoc Document whose pages `1..numPages` are processed.
 * @param onProgress Optional callback invoked before each page is processed
 *   with the page number and the total page count.
 * @returns The signatures in page order.
 */
export async function extractDocumentSignatures(
  pdfDoc: pdfjsLib.PDFDocumentProxy,
  onProgress?: (pageNumber: number, totalPages: number) => void
) {
  const signatures: ComparePageSignature[] = [];
  let pageNumber = 1;

  while (pageNumber <= pdfDoc.numPages) {
    onProgress?.(pageNumber, pdfDoc.numPages);
    const signature = await extractPageSignature(pdfDoc, pageNumber);
    signatures.push(signature);
    pageNumber += 1;
  }

  return signatures;
}
|
||||||
122
src/js/compare/engine/pair-pages.ts
Normal file
122
src/js/compare/engine/pair-pages.ts
Normal file
@@ -0,0 +1,122 @@
|
|||||||
|
import type { ComparePagePair, ComparePageSignature } from '../types.ts';
|
||||||
|
|
||||||
|
/**
 * Splits text on whitespace and returns the set of distinct non-empty tokens.
 *
 * @param text Text to split.
 * @returns Unique tokens; empty for empty or whitespace-only text.
 */
function tokenize(text: string) {
  const unique = new Set<string>();
  for (const piece of text.split(/\s+/)) {
    if (piece) {
      unique.add(piece);
    }
  }
  return unique;
}
|
||||||
|
|
||||||
|
/**
 * Scores how likely two page signatures describe the same page, in `[0, 1]`.
 *
 * - Both pages without text: 0.7 when the page numbers match, else 0.35.
 * - Exactly one page without text: 0.08.
 * - Otherwise: Jaccard similarity of the pages' whitespace-separated token
 *   sets, plus 0.1 when the page numbers match, capped at 1.
 *
 * @param left Signature from the first document.
 * @param right Signature from the second document.
 * @returns The similarity score.
 */
function similarityScore(
  left: ComparePageSignature,
  right: ComparePageSignature
) {
  const samePageNumber = left.pageNumber === right.pageNumber;

  if (!left.hasText && !right.hasText) {
    return samePageNumber ? 0.7 : 0.35;
  }
  if (!(left.hasText && right.hasText)) {
    return 0.08;
  }

  const leftTokens = tokenize(left.plainText);
  const rightTokens = tokenize(right.plainText);

  let sharedCount = 0;
  for (const token of leftTokens) {
    if (rightTokens.has(token)) {
      sharedCount += 1;
    }
  }

  // |A ∪ B| = |A| + |B| - |A ∩ B|
  const unionSize = leftTokens.size + rightTokens.size - sharedCount;
  const jaccard = unionSize === 0 ? 0 : sharedCount / unionSize;
  const positionalBias = samePageNumber ? 0.1 : 0;

  return Math.min(jaccard + positionalBias, 1);
}
|
||||||
|
|
||||||
|
/**
 * Aligns the pages of two documents with an edit-distance style dynamic
 * program.
 *
 * Matching page `i` with page `j` costs `1 - similarityScore(i, j)`; leaving
 * a page unmatched on either side costs 0.8. On equal cost a match is
 * preferred over an unmatched left page, which is preferred over an unmatched
 * right page.
 *
 * @param leftPages Signatures of the first document, in order.
 * @param rightPages Signatures of the second document, in order.
 * @returns Pairs in document order with 1-based `pairIndex`. Unmatched pages
 *   have `null` on the missing side and confidence 0; matched pairs carry the
 *   similarity score as confidence.
 */
export function pairPages(
  leftPages: ComparePageSignature[],
  rightPages: ComparePageSignature[]
) {
  type Step = 'match' | 'left' | 'right';

  const insertionCost = 0.8;
  const leftCount = leftPages.length;
  const rightCount = rightPages.length;

  // cost[r][c]: cheapest alignment of the first r left pages with the first
  // c right pages; steps[r][c]: the last move taken to reach that cell.
  const cost: number[][] = [];
  const steps: Step[][] = [];
  for (let row = 0; row <= leftCount; row += 1) {
    cost.push(new Array<number>(rightCount + 1).fill(0));
    steps.push(new Array<Step>(rightCount + 1).fill('match'));
  }

  for (let row = 1; row <= leftCount; row += 1) {
    cost[row][0] = row * insertionCost;
    steps[row][0] = 'left';
  }
  for (let col = 1; col <= rightCount; col += 1) {
    cost[0][col] = col * insertionCost;
    steps[0][col] = 'right';
  }

  for (let row = 1; row <= leftCount; row += 1) {
    for (let col = 1; col <= rightCount; col += 1) {
      const similarity = similarityScore(
        leftPages[row - 1],
        rightPages[col - 1]
      );
      const viaMatch = cost[row - 1][col - 1] + (1 - similarity);
      const viaLeft = cost[row - 1][col] + insertionCost;
      const viaRight = cost[row][col - 1] + insertionCost;

      // Strict comparisons keep the tie order: match, then left, then right.
      let best = viaMatch;
      let step: Step = 'match';
      if (viaLeft < best) {
        best = viaLeft;
        step = 'left';
      }
      if (viaRight < best) {
        best = viaRight;
        step = 'right';
      }

      cost[row][col] = best;
      steps[row][col] = step;
    }
  }

  // Walk back from the bottom-right corner; pairs are collected last-first.
  const pairs: ComparePagePair[] = [];
  let row = leftCount;
  let col = rightCount;

  while (row > 0 || col > 0) {
    const step = steps[row][col];

    if (row > 0 && col > 0 && step === 'match') {
      const leftPage = leftPages[row - 1];
      const rightPage = rightPages[col - 1];
      pairs.push({
        pairIndex: 0,
        leftPageNumber: leftPage.pageNumber,
        rightPageNumber: rightPage.pageNumber,
        confidence: similarityScore(leftPage, rightPage),
      });
      row -= 1;
      col -= 1;
    } else if (row > 0 && (col === 0 || step === 'left')) {
      pairs.push({
        pairIndex: 0,
        leftPageNumber: leftPages[row - 1].pageNumber,
        rightPageNumber: null,
        confidence: 0,
      });
      row -= 1;
    } else if (col > 0) {
      pairs.push({
        pairIndex: 0,
        leftPageNumber: null,
        rightPageNumber: rightPages[col - 1].pageNumber,
        confidence: 0,
      });
      col -= 1;
    }
  }

  pairs.reverse();
  return pairs.map((pair, index) => ({ ...pair, pairIndex: index + 1 }));
}
|
||||||
64
src/js/compare/engine/text-normalization.ts
Normal file
64
src/js/compare/engine/text-normalization.ts
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
import type { CompareTextItem } from '../types.ts';
|
||||||
|
|
||||||
|
/**
 * Canonicalizes text for comparison.
 *
 * Steps, in order: Unicode NFKC normalization; C0/C1 control characters and
 * DEL replaced by spaces; private-use characters (U+E000–U+F8FF) replaced by
 * spaces; whitespace runs collapsed to one space; leading and trailing
 * whitespace removed.
 *
 * @param text Raw text.
 * @returns The normalized text (possibly empty).
 */
export function normalizeCompareText(text: string) {
  const compatibilityForm = text.normalize('NFKC');
  const withoutControls = compatibilityForm.replace(
    /[\u0000-\u001F\u007F-\u009F]/g,
    ' '
  );
  const withoutPrivateUse = withoutControls.replace(
    /[\u{E000}-\u{F8FF}]/gu,
    ' '
  );
  const singleSpaced = withoutPrivateUse.replace(/\s+/g, ' ');

  return singleSpaced.trim();
}
|
||||||
|
|
||||||
|
/**
 * Tells whether `next` should be glued directly onto `current` without a
 * separating space.
 *
 * That is the case when `current` is empty, when `next` starts with closing
 * punctuation, when `next` is exactly one straight quote character, when
 * `next` starts with an apostrophe or closing curly quote, or when `current`
 * ends with an opening bracket, slash, quote or hyphen.
 *
 * @param current Text accumulated so far.
 * @param next Token about to be appended.
 * @returns `true` to append without a space.
 */
function shouldAppendWithoutSpace(current: string, next: string) {
  if (!current) {
    return true;
  }

  const nextAttachesBackward =
    /^[,.;:!?%)\]}]/.test(next) ||
    /^["']$/.test(next) ||
    /^['’”]/u.test(next);
  const currentAttachesForward = /[([{/"'“‘-]$/u.test(current);

  return nextAttachesBackward || currentAttachesForward;
}
|
||||||
|
|
||||||
|
/**
 * Joins tokens into one string, putting a space between neighbours unless
 * `shouldAppendWithoutSpace` says they belong together. Empty tokens are
 * skipped.
 *
 * @param tokens Normalized tokens in reading order.
 * @returns The joined text; empty when there are no non-empty tokens.
 */
export function joinNormalizedText(tokens: string[]) {
  let joined = '';

  for (const token of tokens) {
    if (!token) {
      continue;
    }
    const separator = shouldAppendWithoutSpace(joined, token) ? '' : ' ';
    joined = `${joined}${separator}${token}`;
  }

  return joined;
}
|
||||||
|
|
||||||
|
/**
 * Joins the normalized text of the given items with `joinNormalizedText`.
 *
 * @param items Text items in reading order.
 * @returns The joined plain text.
 */
export function joinCompareTextItems(items: CompareTextItem[]) {
  const normalizedTexts = items.map(({ normalizedText }) => normalizedText);
  return joinNormalizedText(normalizedTexts);
}
|
||||||
|
|
||||||
|
/**
 * Heuristically flags extracted text that looks like garbage.
 *
 * After normalization the text is considered low quality when any of these
 * holds:
 * - it is empty or contains no letter or digit;
 * - it has at least 12 visible characters, fewer than 45 % of them are
 *   letters or digits, and more than 35 % are other symbols;
 * - it has at least 6 whitespace-separated tokens and fewer than 60 % of them
 *   contain a letter or digit.
 *
 * @param text Extracted text to assess.
 * @returns `true` when the text looks unusable.
 */
export function isLowQualityExtractedText(text: string) {
  const normalized = normalizeCompareText(text);
  if (!normalized) {
    return true;
  }

  const hasAlphaNumeric = (value: string) => /[\p{L}\p{N}]/u.test(value);

  // Count visible characters per code point, not per UTF-16 unit.
  let visibleCount = 0;
  let alphaNumericCount = 0;
  for (const character of normalized) {
    if (character.trim().length === 0) {
      continue;
    }
    visibleCount += 1;
    if (hasAlphaNumeric(character)) {
      alphaNumericCount += 1;
    }
  }

  if (alphaNumericCount === 0) {
    return true;
  }

  const symbolCount = visibleCount - alphaNumericCount;
  const mostlySymbols =
    visibleCount >= 12 &&
    alphaNumericCount / visibleCount < 0.45 &&
    symbolCount / visibleCount > 0.35;
  if (mostlySymbols) {
    return true;
  }

  const tokens = normalized.split(/\s+/).filter(Boolean);
  const tokenWithAlphaNumericCount = tokens.filter(hasAlphaNumeric).length;

  return tokens.length >= 6 && tokenWithAlphaNumericCount / tokens.length < 0.6;
}
|
||||||
134
src/js/compare/engine/visual-diff.ts
Normal file
134
src/js/compare/engine/visual-diff.ts
Normal file
@@ -0,0 +1,134 @@
|
|||||||
|
import pixelmatch from 'pixelmatch';
|
||||||
|
|
||||||
|
import type { CompareVisualDiff } from '../types.ts';
|
||||||
|
|
||||||
|
/**
 * Rectangle that `renderVisualDiff` crops its output to.
 */
type FocusRegion = {
  /** Left edge of the region. */
  x: number;
  /** Top edge of the region. */
  y: number;
  /** Horizontal extent of the region. */
  width: number;
  /** Vertical extent of the region. */
  height: number;
};
|
||||||
|
|
||||||
|
/**
 * Creates a detached `<canvas>` element of the given size.
 *
 * @param width Canvas width.
 * @param height Canvas height.
 * @returns The new canvas; it is not attached to the document.
 */
function createCanvas(width: number, height: number) {
  return Object.assign(document.createElement('canvas'), { width, height });
}
|
||||||
|
|
||||||
|
/**
 * Paints the target canvas white and draws the source canvas centred on it,
 * unscaled.
 *
 * @param sourceCanvas Canvas to copy from.
 * @param targetCanvas Canvas to draw onto.
 * @throws Error when the target's 2D context cannot be obtained.
 */
function drawNormalized(
  sourceCanvas: HTMLCanvasElement,
  targetCanvas: HTMLCanvasElement
) {
  const context = targetCanvas.getContext('2d');
  if (!context) {
    throw new Error('Could not create comparison canvas context.');
  }

  const { width: targetWidth, height: targetHeight } = targetCanvas;
  context.fillStyle = '#ffffff';
  context.fillRect(0, 0, targetWidth, targetHeight);

  const offsetX = Math.floor((targetWidth - sourceCanvas.width) / 2);
  const offsetY = Math.floor((targetHeight - sourceCanvas.height) / 2);
  context.drawImage(sourceCanvas, offsetX, offsetY);
}
|
||||||
|
|
||||||
|
/**
 * Computes a pixel diff of two rendered pages and paints it onto
 * `outputCanvas`.
 *
 * Both inputs are centred on white canvases of the larger of the two sizes
 * and compared with `pixelmatch`. The output shows the second page with the
 * diff image drawn over it at 90 % opacity. When `focusRegion` is given, the
 * output canvas is sized to that region (origin floored and clamped to >= 0,
 * size rounded up and capped at the full size) and shows only that part.
 *
 * @param canvas1 Rendering of the first page.
 * @param canvas2 Rendering of the second page.
 * @param outputCanvas Canvas that is resized and painted with the result.
 * @param focusRegion Optional region to crop the output to.
 * @returns Mismatch count, mismatch ratio relative to the full compared
 *   area, and whether any pixel differs.
 * @throws Error when a required 2D context cannot be obtained.
 */
export function renderVisualDiff(
  canvas1: HTMLCanvasElement,
  canvas2: HTMLCanvasElement,
  outputCanvas: HTMLCanvasElement,
  focusRegion?: FocusRegion
): CompareVisualDiff {
  const width = Math.max(canvas1.width, canvas2.width, 1);
  const height = Math.max(canvas1.height, canvas2.height, 1);

  // Bring both pages to a common size so they can be compared pixel by pixel.
  const normalizedCanvas1 = createCanvas(width, height);
  const normalizedCanvas2 = createCanvas(width, height);
  drawNormalized(canvas1, normalizedCanvas1);
  drawNormalized(canvas2, normalizedCanvas2);

  outputCanvas.width = width;
  outputCanvas.height = height;

  const context1 = normalizedCanvas1.getContext('2d');
  const context2 = normalizedCanvas2.getContext('2d');
  const outputContext = outputCanvas.getContext('2d');
  if (!context1 || !context2 || !outputContext) {
    throw new Error('Could not create visual diff context.');
  }

  const image1 = context1.getImageData(0, 0, width, height);
  const image2 = context2.getImageData(0, 0, width, height);
  const diffImage = outputContext.createImageData(width, height);

  const mismatchPixels = pixelmatch(
    image1.data,
    image2.data,
    diffImage.data,
    width,
    height,
    {
      threshold: 0.12,
      includeAA: false,
      alpha: 0.2,
      diffMask: false,
      diffColor: [239, 68, 68],
      diffColorAlt: [34, 197, 94],
    }
  );

  const overlayCanvas = createCanvas(width, height);
  const overlayContext = overlayCanvas.getContext('2d');
  if (!overlayContext) {
    throw new Error('Could not create visual diff overlay context.');
  }
  overlayContext.putImageData(diffImage, 0, 0);

  let region: FocusRegion = { x: 0, y: 0, width, height };
  if (focusRegion) {
    region = {
      x: Math.max(Math.floor(focusRegion.x), 0),
      y: Math.max(Math.floor(focusRegion.y), 0),
      width: Math.min(Math.ceil(focusRegion.width), width),
      height: Math.min(Math.ceil(focusRegion.height), height),
    };
  }

  outputCanvas.width = Math.max(region.width, 1);
  outputCanvas.height = Math.max(region.height, 1);

  // Copies the focus region of a full-size layer onto the whole output canvas.
  const paintRegion = (layer: HTMLCanvasElement) => {
    outputContext.drawImage(
      layer,
      region.x,
      region.y,
      region.width,
      region.height,
      0,
      0,
      outputCanvas.width,
      outputCanvas.height
    );
  };

  outputContext.fillStyle = '#ffffff';
  outputContext.fillRect(0, 0, outputCanvas.width, outputCanvas.height);
  paintRegion(normalizedCanvas2);

  outputContext.globalAlpha = 0.9;
  paintRegion(overlayCanvas);
  outputContext.globalAlpha = 1;

  return {
    mismatchPixels,
    mismatchRatio: mismatchPixels / Math.max(width * height, 1),
    hasDiff: mismatchPixels > 0,
  };
}
|
||||||
77
src/js/compare/reporting/build-report.ts
Normal file
77
src/js/compare/reporting/build-report.ts
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
import type { ComparePagePair, ComparePageResult } from '../types.ts';
|
||||||
|
|
||||||
|
/**
 * Escapes the five HTML-significant characters so `text` can be embedded in
 * element content or a quoted attribute value.
 *
 * `&` is replaced first so the ampersands introduced by the other
 * replacements are not escaped a second time.
 *
 * @param text Untrusted text (file names, change descriptions, ...).
 * @returns The text with `&`, `<`, `>`, `"` and `'` replaced by entities.
 */
function escapeHtml(text: string) {
  return text
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');
}
|
||||||
|
|
||||||
|
/**
 * Renders a standalone HTML report for a finished comparison.
 *
 * The report contains the two file names, overall added/removed/modified
 * totals, and one card per entry of `results` listing its page numbers,
 * confidence, status, counts and changes. `results[i]` is described using
 * `pairs[i]`; when that pair is missing, the card falls back to the 1-based
 * index, `none` for page numbers and 0 % confidence.
 *
 * File names, statuses, change types and change descriptions are HTML-escaped
 * before insertion.
 *
 * @param fileName1 Display name of the first PDF.
 * @param fileName2 Display name of the second PDF.
 * @param pairs Page pairs, index-aligned with `results`.
 * @param results Per-pair comparison results.
 * @returns The complete HTML document as a string.
 */
export function buildCompareReport(
  fileName1: string,
  fileName2: string,
  pairs: ComparePagePair[],
  results: ComparePageResult[]
) {
  const totals = results.reduce(
    (summary, result) => {
      summary.added += result.summary.added;
      summary.removed += result.summary.removed;
      summary.modified += result.summary.modified;
      return summary;
    },
    { added: 0, removed: 0, modified: 0 }
  );

  const rows = results
    .map((result, index) => {
      const pair = pairs[index];
      const changes = result.changes
        .map(
          (change) =>
            `<li><strong>${escapeHtml(change.type)}</strong>: ${escapeHtml(change.description)}</li>`
        )
        .join('');

      return `
        <section class="pair-card">
          <h2>Comparison ${pair?.pairIndex || index + 1}</h2>
          <p class="meta">PDF 1 page: ${pair?.leftPageNumber ?? 'none'} | PDF 2 page: ${pair?.rightPageNumber ?? 'none'} | Confidence: ${((pair?.confidence || 0) * 100).toFixed(0)}%</p>
          <p class="meta">Status: ${escapeHtml(result.status)}${result.usedOcr ? ' | OCR used' : ''}</p>
          <p class="meta">Added: ${result.summary.added} | Removed: ${result.summary.removed} | Modified: ${result.summary.modified}</p>
          <ul>${changes || '<li>No semantic changes detected.</li>'}</ul>
        </section>
      `;
    })
    .join('');

  return `<!doctype html>
<html lang="en">
<head>
  <meta charset="UTF-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  <title>Compare report</title>
  <style>
    body { font-family: ui-sans-serif, system-ui, sans-serif; margin: 0; padding: 2rem; background: #111827; color: #e5e7eb; }
    .summary { display: grid; grid-template-columns: repeat(3, minmax(0, 1fr)); gap: 1rem; margin: 1.5rem 0; }
    .card, .pair-card { background: #1f2937; border: 1px solid #374151; border-radius: 12px; padding: 1rem 1.25rem; }
    .pair-card { margin-bottom: 1rem; }
    .meta { color: #9ca3af; font-size: 0.95rem; }
    h1, h2 { margin: 0 0 0.75rem 0; }
    ul { margin: 0.75rem 0 0 1.25rem; }
  </style>
</head>
<body>
  <h1>PDF Compare Report</h1>
  <p class="meta">PDF 1: ${escapeHtml(fileName1)} | PDF 2: ${escapeHtml(fileName2)}</p>
  <div class="summary">
    <div class="card"><div class="meta">Added</div><div>${totals.added}</div></div>
    <div class="card"><div class="meta">Removed</div><div>${totals.removed}</div></div>
    <div class="card"><div class="meta">Modified</div><div>${totals.modified}</div></div>
  </div>
  ${rows}
</body>
</html>`;
}
|
||||||
18
src/js/compare/reporting/export-html-report.ts
Normal file
18
src/js/compare/reporting/export-html-report.ts
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
import { buildCompareReport } from './build-report.ts';
|
||||||
|
import type { ComparePagePair, ComparePageResult } from '../types.ts';
|
||||||
|
|
||||||
|
/**
 * Builds the HTML comparison report and triggers a browser download of it as
 * `bentopdf-compare-report.html`.
 *
 * The report is wrapped in a Blob, exposed through an object URL, downloaded
 * by clicking a detached anchor, and the object URL is revoked right after.
 *
 * @param fileName1 Display name of the first PDF.
 * @param fileName2 Display name of the second PDF.
 * @param pairs Page pairs, index-aligned with `results`.
 * @param results Per-pair comparison results.
 */
export function exportCompareHtmlReport(
  fileName1: string,
  fileName2: string,
  pairs: ComparePagePair[],
  results: ComparePageResult[]
) {
  const reportBlob = new Blob(
    [buildCompareReport(fileName1, fileName2, pairs, results)],
    { type: 'text/html;charset=utf-8' }
  );
  const reportUrl = URL.createObjectURL(reportBlob);

  const downloadLink = document.createElement('a');
  downloadLink.href = reportUrl;
  downloadLink.download = 'bentopdf-compare-report.html';
  downloadLink.click();

  URL.revokeObjectURL(reportUrl);
}
|
||||||
113
src/js/compare/types.ts
Normal file
113
src/js/compare/types.ts
Normal file
@@ -0,0 +1,113 @@
|
|||||||
|
import type * as pdfjsLib from 'pdfjs-dist';
|
||||||
|
|
||||||
|
/** Layout used to present the two documents in the compare viewer. */
export type CompareViewMode = 'overlay' | 'side-by-side';
|
||||||
|
|
||||||
|
/** Axis-aligned rectangle given by its origin and size. */
export interface CompareRectangle {
  /** Horizontal position of the origin. */
  x: number;
  /** Vertical position of the origin. */
  y: number;
  /** Horizontal extent. */
  width: number;
  /** Vertical extent. */
  height: number;
}
|
||||||
|
|
||||||
|
/** Horizontal span attributed to a single character of a text item. */
export interface CharPosition {
  /** Left edge of the character. */
  x: number;
  /** Width attributed to the character. */
  width: number;
}
|
||||||
|
|
||||||
|
/** A single word with its location. */
export interface CompareWordToken {
  /** Word text. */
  word: string;
  /** Form of the word used when comparing. */
  compareWord: string;
  /** Rectangle covering the word. */
  rect: CompareRectangle;
  /** When set, the token is meant to be fused with the token before it. */
  joinsWithPrevious?: boolean;
}
|
||||||
|
|
||||||
|
/** A piece of page text with its location. */
export interface CompareTextItem {
  /** Identifier of the item. */
  id: string;
  /** Text as extracted. */
  text: string;
  /** Text after comparison normalization. */
  normalizedText: string;
  /** Rectangle covering the item. */
  rect: CompareRectangle;
  /** Items this one was assembled from, if any. */
  fragments?: CompareTextItem[];
  /** Per-character horizontal positions, if computed. */
  charMap?: CharPosition[];
  /** Word-level breakdown of the item, if available. */
  wordTokens?: CompareWordToken[];
}
|
||||||
|
|
||||||
|
/** Extracted text model of one page. */
export interface ComparePageModel {
  /** Number of the page this model describes. */
  pageNumber: number;
  /** Page width. */
  width: number;
  /** Page height. */
  height: number;
  /** Text items found on the page. */
  textItems: CompareTextItem[];
  /** All page text joined into one string. */
  plainText: string;
  /** Whether any text was found. */
  hasText: boolean;
  /** How the text was obtained: the PDF text layer or OCR. */
  source: 'pdfjs' | 'ocr';
}
|
||||||
|
|
||||||
|
/** Lightweight text summary of a page, used for pairing pages. */
export interface ComparePageSignature {
  /** Number of the page this signature describes. */
  pageNumber: number;
  /** Page text joined into one string. */
  plainText: string;
  /** Whether the page has any text. */
  hasText: boolean;
  /** The signature's tokens as text items. */
  tokenItems: CompareTextItem[];
}
|
||||||
|
|
||||||
|
/** One row of the page alignment between the two documents. */
export interface ComparePagePair {
  /** Position of this pair in the alignment. */
  pairIndex: number;
  /** Page in the first document, or `null` when there is none. */
  leftPageNumber: number | null;
  /** Page in the second document, or `null` when there is none. */
  rightPageNumber: number | null;
  /** Confidence that the two pages correspond. */
  confidence: number;
}
|
||||||
|
|
||||||
|
/** Outcome of a pixel-level comparison of two page renderings. */
export interface CompareVisualDiff {
  /** Number of pixels that differ. */
  mismatchPixels: number;
  /** Differing pixels as a fraction of the compared area. */
  mismatchRatio: number;
  /** Whether any pixel differs. */
  hasDiff: boolean;
}
|
||||||
|
|
||||||
|
/** Kinds of change a comparison can report, at text or whole-page level. */
export type CompareChangeType =
  | 'added'
  | 'removed'
  | 'modified'
  | 'page-added'
  | 'page-removed';
|
||||||
|
|
||||||
|
/** A single reported change between the two documents. */
export interface CompareTextChange {
  /** Identifier of the change. */
  id: string;
  /** Kind of change. */
  type: CompareChangeType;
  /** Human-readable description. */
  description: string;
  /** Text before the change. */
  beforeText: string;
  /** Text after the change. */
  afterText: string;
  /** Rectangles locating the change on the "before" side. */
  beforeRects: CompareRectangle[];
  /** Rectangles locating the change on the "after" side. */
  afterRects: CompareRectangle[];
}
|
||||||
|
|
||||||
|
/** Change counts grouped by kind. */
export interface CompareChangeSummary {
  /** Number of additions. */
  added: number;
  /** Number of removals. */
  removed: number;
  /** Number of modifications. */
  modified: number;
}
|
||||||
|
|
||||||
|
/** Result of comparing one page pair. */
export interface ComparePageResult {
  /** Overall outcome for the pair. */
  status: 'match' | 'changed' | 'left-only' | 'right-only';
  /** Page in the first document, or `null` when there is none. */
  leftPageNumber: number | null;
  /** Page in the second document, or `null` when there is none. */
  rightPageNumber: number | null;
  /** Individual changes found. */
  changes: CompareTextChange[];
  /** Counts of the changes by kind. */
  summary: CompareChangeSummary;
  /** Pixel-level comparison result, or `null` when there is none. */
  visualDiff: CompareVisualDiff | null;
  /** Pairing confidence, when known. */
  confidence?: number;
  /** Whether OCR was used for this result. */
  usedOcr?: boolean;
}
|
||||||
|
|
||||||
|
/** Filter applied to the change list: one change kind, or everything. */
export type CompareFilterType = 'added' | 'removed' | 'modified' | 'all';
|
||||||
|
|
||||||
|
/** UI state of the PDF compare tool. */
export interface CompareState {
  /** First loaded document, or `null` when none is loaded. */
  pdfDoc1: pdfjsLib.PDFDocumentProxy | null;
  /** Second loaded document, or `null` when none is loaded. */
  pdfDoc2: pdfjsLib.PDFDocumentProxy | null;
  /** Currently shown page. */
  currentPage: number;
  /** Active viewer layout. */
  viewMode: CompareViewMode;
  /** Whether scrolling of the two panels is synchronized. */
  isSyncScroll: boolean;
  /** Comparison result currently displayed, if any. */
  currentComparison: ComparePageResult | null;
  /** Index of the change currently in focus. */
  activeChangeIndex: number;
  /** Page alignment between the two documents. */
  pagePairs: ComparePagePair[];
  /** Change-kind filter applied to the change list. */
  activeFilter: CompareFilterType;
  /** Search text applied to the change list. */
  changeSearchQuery: string;
  /** Whether OCR is enabled. */
  useOcr: boolean;
  /** Language used for OCR. */
  ocrLanguage: string;
}
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -1,9 +1 @@
|
|||||||
import * as pdfjsLib from 'pdfjs-dist';
|
export type { CompareState } from '../compare/types.ts';
|
||||||
|
|
||||||
export interface CompareState {
|
|
||||||
pdfDoc1: pdfjsLib.PDFDocumentProxy | null;
|
|
||||||
pdfDoc2: pdfjsLib.PDFDocumentProxy | null;
|
|
||||||
currentPage: number;
|
|
||||||
viewMode: 'overlay' | 'side-by-side';
|
|
||||||
isSyncScroll: boolean;
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -72,31 +72,362 @@
|
|||||||
<style>
|
<style>
|
||||||
.compare-viewer-wrapper.overlay-mode {
|
.compare-viewer-wrapper.overlay-mode {
|
||||||
position: relative;
|
position: relative;
|
||||||
|
background: #ffffff;
|
||||||
|
overflow: hidden;
|
||||||
|
padding: 1.5rem;
|
||||||
}
|
}
|
||||||
|
|
||||||
.compare-viewer-wrapper.overlay-mode #panel-1,
|
.compare-viewer-wrapper.overlay-mode #panel-1 {
|
||||||
.compare-viewer-wrapper.overlay-mode #panel-2 {
|
|
||||||
position: absolute;
|
|
||||||
top: 0;
|
|
||||||
left: 0;
|
|
||||||
width: 100%;
|
width: 100%;
|
||||||
height: 100%;
|
height: 100%;
|
||||||
|
overflow: auto;
|
||||||
|
scrollbar-width: none;
|
||||||
|
-ms-overflow-style: none;
|
||||||
|
}
|
||||||
|
|
||||||
|
.compare-viewer-wrapper.overlay-mode #panel-1::-webkit-scrollbar {
|
||||||
|
display: none;
|
||||||
}
|
}
|
||||||
|
|
||||||
.compare-viewer-wrapper.overlay-mode #panel-2 {
|
.compare-viewer-wrapper.overlay-mode #panel-2 {
|
||||||
|
position: absolute;
|
||||||
|
inset: 1.5rem;
|
||||||
|
overflow: hidden;
|
||||||
pointer-events: none;
|
pointer-events: none;
|
||||||
|
background: transparent;
|
||||||
|
}
|
||||||
|
|
||||||
|
.compare-viewer-wrapper.overlay-mode .compare-panel-label {
|
||||||
|
display: none;
|
||||||
|
}
|
||||||
|
|
||||||
|
.compare-viewer-wrapper.overlay-mode .compare-canvas-stage canvas {
|
||||||
|
position: static;
|
||||||
|
top: auto;
|
||||||
|
left: auto;
|
||||||
|
width: auto;
|
||||||
|
height: auto;
|
||||||
|
}
|
||||||
|
|
||||||
|
.compare-viewer-wrapper.overlay-mode
|
||||||
|
#panel-2
|
||||||
|
.compare-canvas-stage
|
||||||
|
canvas {
|
||||||
|
background: transparent;
|
||||||
}
|
}
|
||||||
|
|
||||||
.compare-viewer-wrapper.side-by-side-mode {
|
.compare-viewer-wrapper.side-by-side-mode {
|
||||||
display: flex;
|
display: flex;
|
||||||
|
gap: 0;
|
||||||
|
padding: 0;
|
||||||
|
background: #ffffff;
|
||||||
|
}
|
||||||
|
|
||||||
|
.compare-workspace {
|
||||||
|
display: grid;
|
||||||
gap: 1rem;
|
gap: 1rem;
|
||||||
|
grid-template-columns: minmax(0, 1fr) 20rem;
|
||||||
|
align-items: stretch;
|
||||||
}
|
}
|
||||||
|
|
||||||
.compare-viewer-wrapper.side-by-side-mode #panel-1,
|
.compare-viewer-wrapper.side-by-side-mode #panel-1,
|
||||||
.compare-viewer-wrapper.side-by-side-mode #panel-2 {
|
.compare-viewer-wrapper.side-by-side-mode #panel-2 {
|
||||||
flex: 1;
|
flex: 1;
|
||||||
overflow: auto;
|
overflow: auto;
|
||||||
max-height: 70vh;
|
min-height: 0;
|
||||||
|
height: 100%;
|
||||||
|
border: none;
|
||||||
|
border-radius: 0;
|
||||||
|
box-shadow: none;
|
||||||
|
}
|
||||||
|
|
||||||
|
.compare-viewer-wrapper.side-by-side-mode #panel-1 {
|
||||||
|
border-right: 2px solid #cbd5e1;
|
||||||
|
}
|
||||||
|
|
||||||
|
.compare-panel {
|
||||||
|
position: relative;
|
||||||
|
min-width: 0;
|
||||||
|
min-height: 0;
|
||||||
|
background: #ffffff;
|
||||||
|
}
|
||||||
|
|
||||||
|
.compare-panel-label {
|
||||||
|
position: sticky;
|
||||||
|
top: 0;
|
||||||
|
z-index: 5;
|
||||||
|
padding: 0.5rem 1rem;
|
||||||
|
font-size: 0.75rem;
|
||||||
|
font-weight: 600;
|
||||||
|
text-transform: uppercase;
|
||||||
|
letter-spacing: 0.05em;
|
||||||
|
color: #64748b;
|
||||||
|
background: rgba(255, 255, 255, 0.95);
|
||||||
|
border-bottom: 1px solid #e2e8f0;
|
||||||
|
backdrop-filter: blur(4px);
|
||||||
|
}
|
||||||
|
|
||||||
|
.compare-canvas-stage {
|
||||||
|
--compare-stage-pad-top: 1.5rem;
|
||||||
|
--compare-stage-pad-x: 1.75rem;
|
||||||
|
--compare-stage-pad-bottom: 1.75rem;
|
||||||
|
position: relative;
|
||||||
|
width: max-content;
|
||||||
|
margin: 0 auto;
|
||||||
|
padding: var(--compare-stage-pad-top) var(--compare-stage-pad-x)
|
||||||
|
var(--compare-stage-pad-bottom);
|
||||||
|
}
|
||||||
|
|
||||||
|
.compare-canvas-stage canvas {
|
||||||
|
display: block;
|
||||||
|
background: #ffffff;
|
||||||
|
}
|
||||||
|
|
||||||
|
.compare-highlight-layer {
|
||||||
|
position: absolute;
|
||||||
|
inset: var(--compare-stage-pad-top) var(--compare-stage-pad-x)
|
||||||
|
var(--compare-stage-pad-bottom);
|
||||||
|
pointer-events: none;
|
||||||
|
}
|
||||||
|
|
||||||
|
.compare-highlight {
|
||||||
|
position: absolute;
|
||||||
|
border-radius: 2px;
|
||||||
|
border: none;
|
||||||
|
}
|
||||||
|
|
||||||
|
.compare-highlight.added {
|
||||||
|
background: rgba(34, 197, 94, 0.28);
|
||||||
|
}
|
||||||
|
|
||||||
|
.compare-highlight.removed,
|
||||||
|
.compare-highlight.page-removed {
|
||||||
|
background: rgba(239, 68, 68, 0.28);
|
||||||
|
}
|
||||||
|
|
||||||
|
.compare-highlight.modified {
|
||||||
|
background: rgba(245, 158, 11, 0.28);
|
||||||
|
}
|
||||||
|
|
||||||
|
.compare-highlight.active {
|
||||||
|
outline: 2px solid rgba(99, 102, 241, 0.7);
|
||||||
|
outline-offset: 1px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.compare-placeholder {
|
||||||
|
position: absolute;
|
||||||
|
inset: 2rem;
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
justify-content: center;
|
||||||
|
text-align: center;
|
||||||
|
padding: 1rem;
|
||||||
|
border: 1px dashed #94a3b8;
|
||||||
|
border-radius: 0.75rem;
|
||||||
|
color: #475569;
|
||||||
|
background: rgba(255, 255, 255, 0.92);
|
||||||
|
}
|
||||||
|
|
||||||
|
.compare-placeholder.hidden {
|
||||||
|
display: none;
|
||||||
|
}
|
||||||
|
|
||||||
|
.compare-change-item.active {
|
||||||
|
border-color: #818cf8;
|
||||||
|
background: rgba(79, 70, 229, 0.12);
|
||||||
|
}
|
||||||
|
|
||||||
|
.compare-sidebar {
|
||||||
|
display: flex;
|
||||||
|
flex-direction: column;
|
||||||
|
gap: 0;
|
||||||
|
background: rgba(15, 23, 42, 0.6);
|
||||||
|
border: 1px solid rgba(51, 65, 85, 0.5);
|
||||||
|
border-radius: 0.75rem;
|
||||||
|
overflow: hidden;
|
||||||
|
height: clamp(36rem, 82vh, 72rem);
|
||||||
|
}
|
||||||
|
|
||||||
|
.compare-sidebar-header {
|
||||||
|
padding: 0.75rem 1rem;
|
||||||
|
border-bottom: 1px solid rgba(51, 65, 85, 0.5);
|
||||||
|
}
|
||||||
|
|
||||||
|
.compare-sidebar-filters {
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 0.375rem;
|
||||||
|
padding: 0.625rem 1rem;
|
||||||
|
border-bottom: 1px solid rgba(51, 65, 85, 0.4);
|
||||||
|
flex-wrap: wrap;
|
||||||
|
}
|
||||||
|
|
||||||
|
.compare-pill {
|
||||||
|
display: inline-flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 0.25rem;
|
||||||
|
border-radius: 9999px;
|
||||||
|
padding: 0.25rem 0.625rem;
|
||||||
|
font-size: 0.6875rem;
|
||||||
|
font-weight: 600;
|
||||||
|
border: 1px solid transparent;
|
||||||
|
cursor: pointer;
|
||||||
|
transition: all 0.15s;
|
||||||
|
white-space: nowrap;
|
||||||
|
}
|
||||||
|
|
||||||
|
.compare-pill.removed {
|
||||||
|
color: #fca5a5;
|
||||||
|
background: rgba(239, 68, 68, 0.1);
|
||||||
|
border-color: rgba(239, 68, 68, 0.15);
|
||||||
|
}
|
||||||
|
|
||||||
|
.compare-pill.removed.active {
|
||||||
|
background: rgba(239, 68, 68, 0.25);
|
||||||
|
border-color: rgba(239, 68, 68, 0.5);
|
||||||
|
}
|
||||||
|
|
||||||
|
.compare-pill.modified {
|
||||||
|
color: #fcd34d;
|
||||||
|
background: rgba(245, 158, 11, 0.1);
|
||||||
|
border-color: rgba(245, 158, 11, 0.15);
|
||||||
|
}
|
||||||
|
|
||||||
|
.compare-pill.modified.active {
|
||||||
|
background: rgba(245, 158, 11, 0.25);
|
||||||
|
border-color: rgba(245, 158, 11, 0.5);
|
||||||
|
}
|
||||||
|
|
||||||
|
.compare-pill.added {
|
||||||
|
color: #86efac;
|
||||||
|
background: rgba(34, 197, 94, 0.1);
|
||||||
|
border-color: rgba(34, 197, 94, 0.15);
|
||||||
|
}
|
||||||
|
|
||||||
|
.compare-pill.added.active {
|
||||||
|
background: rgba(34, 197, 94, 0.25);
|
||||||
|
border-color: rgba(34, 197, 94, 0.5);
|
||||||
|
}
|
||||||
|
|
||||||
|
.compare-change-list {
|
||||||
|
flex: 1;
|
||||||
|
min-height: 0;
|
||||||
|
overflow-y: auto;
|
||||||
|
display: flex;
|
||||||
|
flex-direction: column;
|
||||||
|
gap: 1rem;
|
||||||
|
padding: 1rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.compare-change-item {
|
||||||
|
display: flex;
|
||||||
|
align-items: flex-start;
|
||||||
|
gap: 0.625rem;
|
||||||
|
padding: 0.75rem 1rem;
|
||||||
|
cursor: pointer;
|
||||||
|
transition: background 0.1s;
|
||||||
|
border: 1px solid rgba(51, 65, 85, 0.3);
|
||||||
|
border-left: 2px solid transparent;
|
||||||
|
border-radius: 0.5rem;
|
||||||
|
font-size: 0.8125rem;
|
||||||
|
color: #cbd5e1;
|
||||||
|
line-height: 1.4;
|
||||||
|
}
|
||||||
|
|
||||||
|
.compare-change-item:hover {
|
||||||
|
background: rgba(99, 102, 241, 0.08);
|
||||||
|
}
|
||||||
|
|
||||||
|
.compare-change-item.active {
|
||||||
|
background: rgba(99, 102, 241, 0.15);
|
||||||
|
border-left: 2px solid #818cf8;
|
||||||
|
}
|
||||||
|
|
||||||
|
.compare-change-dot {
|
||||||
|
width: 0.5rem;
|
||||||
|
height: 0.5rem;
|
||||||
|
border-radius: 50%;
|
||||||
|
flex-shrink: 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
.compare-change-item .compare-change-dot {
|
||||||
|
margin-top: 0.35rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.compare-change-dot.added {
|
||||||
|
background: #22c55e;
|
||||||
|
}
|
||||||
|
.compare-change-dot.removed {
|
||||||
|
background: #ef4444;
|
||||||
|
}
|
||||||
|
.compare-change-dot.modified {
|
||||||
|
background: #f59e0b;
|
||||||
|
}
|
||||||
|
.compare-change-dot.page-added {
|
||||||
|
background: #22c55e;
|
||||||
|
}
|
||||||
|
.compare-change-dot.page-removed {
|
||||||
|
background: #ef4444;
|
||||||
|
}
|
||||||
|
|
||||||
|
.compare-change-desc {
|
||||||
|
flex: 1;
|
||||||
|
min-width: 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
.compare-change-desc-text {
|
||||||
|
white-space: normal;
|
||||||
|
overflow-wrap: anywhere;
|
||||||
|
}
|
||||||
|
|
||||||
|
.compare-change-type {
|
||||||
|
font-size: 0.625rem;
|
||||||
|
font-weight: 600;
|
||||||
|
text-transform: uppercase;
|
||||||
|
letter-spacing: 0.04em;
|
||||||
|
flex-shrink: 0;
|
||||||
|
margin-top: 0.2rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.compare-change-type.added,
|
||||||
|
.compare-change-type.page-added {
|
||||||
|
color: #86efac;
|
||||||
|
}
|
||||||
|
.compare-change-type.removed,
|
||||||
|
.compare-change-type.page-removed {
|
||||||
|
color: #fca5a5;
|
||||||
|
}
|
||||||
|
.compare-change-type.modified {
|
||||||
|
color: #fcd34d;
|
||||||
|
}
|
||||||
|
|
||||||
|
.compare-change-empty {
|
||||||
|
padding: 2rem 1rem;
|
||||||
|
font-size: 0.8125rem;
|
||||||
|
color: #64748b;
|
||||||
|
text-align: center;
|
||||||
|
}
|
||||||
|
|
||||||
|
@media (max-width: 1023px) {
|
||||||
|
.compare-workspace {
|
||||||
|
grid-template-columns: minmax(0, 1fr);
|
||||||
|
}
|
||||||
|
|
||||||
|
.compare-sidebar {
|
||||||
|
height: auto;
|
||||||
|
max-height: 20rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.compare-viewer-wrapper.side-by-side-mode {
|
||||||
|
gap: 1rem;
|
||||||
|
padding: 1rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.compare-canvas-stage {
|
||||||
|
--compare-stage-pad-top: 1rem;
|
||||||
|
--compare-stage-pad-x: 1rem;
|
||||||
|
--compare-stage-pad-bottom: 1rem;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
</style>
|
</style>
|
||||||
|
|
||||||
@@ -134,7 +465,7 @@
|
|||||||
>
|
>
|
||||||
<div
|
<div
|
||||||
id="tool-uploader"
|
id="tool-uploader"
|
||||||
class="bg-gray-800 rounded-xl shadow-xl px-4 py-8 md:p-8 max-w-5xl w-full text-gray-200 border border-gray-700"
|
class="bg-gray-800 rounded-xl shadow-xl px-4 py-8 md:p-8 max-w-[96rem] w-full text-gray-200 border border-gray-700"
|
||||||
>
|
>
|
||||||
<button
|
<button
|
||||||
id="back-to-tools"
|
id="back-to-tools"
|
||||||
@@ -200,69 +531,76 @@
|
|||||||
|
|
||||||
<!-- Compare Viewer (hidden until both files loaded) -->
|
<!-- Compare Viewer (hidden until both files loaded) -->
|
||||||
<div id="compare-viewer" class="hidden">
|
<div id="compare-viewer" class="hidden">
|
||||||
<!-- Unified Toolbar -->
|
<!-- Toolbar -->
|
||||||
<div
|
<div
|
||||||
class="flex flex-wrap items-center justify-center gap-4 mb-4 p-3 bg-gray-900 rounded-lg border border-gray-700"
|
class="flex flex-wrap items-center gap-3 mb-3 p-2 bg-gray-900 rounded-lg border border-gray-700"
|
||||||
>
|
>
|
||||||
<!-- Page Navigation -->
|
|
||||||
<button
|
<button
|
||||||
id="prev-page-compare"
|
id="prev-page-compare"
|
||||||
class="btn p-2 rounded-full bg-gray-700 hover:bg-gray-600 disabled:opacity-50"
|
class="btn p-1.5 rounded bg-gray-700 hover:bg-gray-600 disabled:opacity-50"
|
||||||
disabled
|
disabled
|
||||||
|
title="Previous page"
|
||||||
>
|
>
|
||||||
<i data-lucide="chevron-left"></i>
|
<i data-lucide="chevron-left" class="w-4 h-4"></i>
|
||||||
</button>
|
</button>
|
||||||
<span class="text-white font-medium">
|
<span class="text-sm text-white font-medium">
|
||||||
Page <span id="current-page-display-compare">1</span> of
|
Page <span id="current-page-display-compare">1</span> /
|
||||||
<span id="total-pages-display-compare">1</span>
|
<span id="total-pages-display-compare">1</span>
|
||||||
</span>
|
</span>
|
||||||
<button
|
<button
|
||||||
id="next-page-compare"
|
id="next-page-compare"
|
||||||
class="btn p-2 rounded-full bg-gray-700 hover:bg-gray-600 disabled:opacity-50"
|
class="btn p-1.5 rounded bg-gray-700 hover:bg-gray-600 disabled:opacity-50"
|
||||||
disabled
|
disabled
|
||||||
|
title="Next page"
|
||||||
>
|
>
|
||||||
<i data-lucide="chevron-right"></i>
|
<i data-lucide="chevron-right" class="w-4 h-4"></i>
|
||||||
</button>
|
</button>
|
||||||
|
|
||||||
<!-- Divider -->
|
<div class="border-l border-gray-700 h-5 mx-1"></div>
|
||||||
<div
|
|
||||||
class="border-l border-gray-600 h-6 mx-2 hidden sm:block"
|
|
||||||
></div>
|
|
||||||
|
|
||||||
<!-- View Mode Buttons -->
|
<div class="bg-gray-700 p-0.5 rounded flex gap-0.5">
|
||||||
<div class="bg-gray-700 p-1 rounded-md flex gap-1">
|
|
||||||
<button
|
<button
|
||||||
id="view-mode-overlay"
|
id="view-mode-overlay"
|
||||||
class="btn bg-indigo-600 px-3 py-1 rounded text-sm font-semibold"
|
class="btn bg-indigo-600 px-2.5 py-1 rounded text-xs font-semibold"
|
||||||
>
|
>
|
||||||
Overlay
|
Overlay
|
||||||
</button>
|
</button>
|
||||||
<button
|
<button
|
||||||
id="view-mode-side"
|
id="view-mode-side"
|
||||||
class="btn px-3 py-1 rounded text-sm font-semibold"
|
class="btn px-2.5 py-1 rounded text-xs font-semibold"
|
||||||
>
|
>
|
||||||
Side-by-Side
|
Side-by-Side
|
||||||
</button>
|
</button>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<!-- Divider -->
|
<div class="border-l border-gray-700 h-5 mx-1"></div>
|
||||||
<div
|
|
||||||
class="border-l border-gray-600 h-6 mx-2 hidden sm:block"
|
|
||||||
></div>
|
|
||||||
|
|
||||||
<!-- Overlay Controls -->
|
<button
|
||||||
<div id="overlay-controls" class="flex items-center gap-2">
|
id="prev-change-btn"
|
||||||
|
class="btn bg-gray-700 hover:bg-gray-600 px-2.5 py-1 rounded text-xs font-semibold disabled:opacity-50"
|
||||||
|
disabled
|
||||||
|
title="Previous change"
|
||||||
|
>
|
||||||
|
<i data-lucide="chevron-up" class="w-3.5 h-3.5"></i>
|
||||||
|
</button>
|
||||||
|
<button
|
||||||
|
id="next-change-btn"
|
||||||
|
class="btn bg-gray-700 hover:bg-gray-600 px-2.5 py-1 rounded text-xs font-semibold disabled:opacity-50"
|
||||||
|
disabled
|
||||||
|
title="Next change"
|
||||||
|
>
|
||||||
|
<i data-lucide="chevron-down" class="w-3.5 h-3.5"></i>
|
||||||
|
</button>
|
||||||
|
|
||||||
|
<div class="flex-1"></div>
|
||||||
|
|
||||||
|
<div id="overlay-controls" class="hidden flex items-center gap-2">
|
||||||
<button
|
<button
|
||||||
id="flicker-btn"
|
id="flicker-btn"
|
||||||
class="btn bg-gray-700 hover:bg-gray-600 px-3 py-1 rounded-md text-sm font-semibold"
|
class="btn bg-gray-700 hover:bg-gray-600 px-2.5 py-1 rounded text-xs font-semibold"
|
||||||
>
|
>
|
||||||
Flicker
|
Flicker
|
||||||
</button>
|
</button>
|
||||||
<label
|
|
||||||
for="opacity-slider"
|
|
||||||
class="text-sm font-medium text-gray-300"
|
|
||||||
>Opacity:</label
|
|
||||||
>
|
|
||||||
<input
|
<input
|
||||||
type="range"
|
type="range"
|
||||||
id="opacity-slider"
|
id="opacity-slider"
|
||||||
@@ -270,46 +608,129 @@
|
|||||||
max="1"
|
max="1"
|
||||||
step="0.05"
|
step="0.05"
|
||||||
value="0.5"
|
value="0.5"
|
||||||
class="w-24 accent-indigo-500"
|
class="w-20 accent-indigo-500"
|
||||||
/>
|
/>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<!-- Side-by-side Controls (hidden initially) -->
|
<div id="side-by-side-controls" class="flex items-center gap-2">
|
||||||
<div
|
|
||||||
id="side-by-side-controls"
|
|
||||||
class="hidden flex items-center gap-2"
|
|
||||||
>
|
|
||||||
<label
|
<label
|
||||||
class="flex items-center gap-2 text-sm font-medium text-gray-300 cursor-pointer"
|
class="flex items-center gap-1.5 text-xs text-gray-300 cursor-pointer"
|
||||||
>
|
>
|
||||||
<input
|
<input
|
||||||
type="checkbox"
|
type="checkbox"
|
||||||
id="sync-scroll-toggle"
|
id="sync-scroll-toggle"
|
||||||
checked
|
checked
|
||||||
class="w-4 h-4 rounded text-indigo-600 bg-gray-700 border-gray-600 focus:ring-indigo-500"
|
class="w-3.5 h-3.5 rounded text-indigo-600 bg-gray-700 border-gray-600"
|
||||||
/>
|
/>
|
||||||
Sync Scrolling
|
Sync scroll
|
||||||
</label>
|
</label>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<button
|
||||||
|
id="export-report-btn"
|
||||||
|
class="btn bg-gray-700 hover:bg-gray-600 p-1.5 rounded disabled:opacity-50"
|
||||||
|
disabled
|
||||||
|
title="Export report"
|
||||||
|
>
|
||||||
|
<i data-lucide="download" class="w-4 h-4"></i>
|
||||||
|
</button>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<!-- Viewer Wrapper -->
|
<div class="compare-workspace">
|
||||||
<div
|
<div
|
||||||
id="compare-viewer-wrapper"
|
id="compare-viewer-wrapper"
|
||||||
class="compare-viewer-wrapper overlay-mode bg-gray-900 rounded-lg border border-gray-700 min-h-[400px] relative"
|
class="compare-viewer-wrapper side-by-side-mode border border-slate-200 relative"
|
||||||
>
|
>
|
||||||
<div id="panel-1" class="overflow-auto">
|
<div id="panel-1" class="compare-panel overflow-auto">
|
||||||
<canvas id="canvas-compare-1" class="block mx-auto"></canvas>
|
<div class="compare-panel-label" id="compare-panel-label-1">
|
||||||
|
Original
|
||||||
</div>
|
</div>
|
||||||
<div id="panel-2" class="overflow-auto">
|
<div class="compare-canvas-stage">
|
||||||
|
<canvas id="canvas-compare-1" class="block mx-auto"></canvas>
|
||||||
|
<div id="highlights-1" class="compare-highlight-layer"></div>
|
||||||
|
<div
|
||||||
|
id="placeholder-1"
|
||||||
|
class="compare-placeholder hidden"
|
||||||
|
></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div id="panel-2" class="compare-panel overflow-auto">
|
||||||
|
<div class="compare-panel-label" id="compare-panel-label-2">
|
||||||
|
Modified
|
||||||
|
</div>
|
||||||
|
<div class="compare-canvas-stage">
|
||||||
<canvas
|
<canvas
|
||||||
id="canvas-compare-2"
|
id="canvas-compare-2"
|
||||||
class="block mx-auto"
|
class="block mx-auto"
|
||||||
style="opacity: 0.5"
|
style="opacity: 1"
|
||||||
></canvas>
|
></canvas>
|
||||||
|
<div id="highlights-2" class="compare-highlight-layer"></div>
|
||||||
|
<div
|
||||||
|
id="placeholder-2"
|
||||||
|
class="compare-placeholder hidden"
|
||||||
|
></div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<aside class="compare-sidebar">
|
||||||
|
<div class="compare-sidebar-header">
|
||||||
|
<div class="relative">
|
||||||
|
<span
|
||||||
|
class="absolute inset-y-0 left-0 flex items-center pl-3"
|
||||||
|
>
|
||||||
|
<i data-lucide="search" class="w-4 h-4 text-gray-400"></i>
|
||||||
|
</span>
|
||||||
|
<input
|
||||||
|
type="text"
|
||||||
|
id="compare-search-input"
|
||||||
|
placeholder="Search changes..."
|
||||||
|
class="w-full pl-9 pr-3 py-2 bg-gray-700 text-white text-sm border border-gray-600 rounded-lg focus:ring-indigo-500 focus:border-indigo-500"
|
||||||
|
/>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="compare-sidebar-filters">
|
||||||
|
<button id="filter-removed" class="compare-pill removed">
|
||||||
|
<span class="compare-change-dot removed"></span>
|
||||||
|
<span id="summary-removed-count">0</span> Deleted
|
||||||
|
</button>
|
||||||
|
<button id="filter-added" class="compare-pill added">
|
||||||
|
<span class="compare-change-dot added"></span>
|
||||||
|
<span id="summary-added-count">0</span> Added
|
||||||
|
</button>
|
||||||
|
<button id="filter-modified" class="compare-pill modified">
|
||||||
|
<span class="compare-change-dot modified"></span>
|
||||||
|
<span id="summary-modified-count">0</span> Modified
|
||||||
|
</button>
|
||||||
|
<label
|
||||||
|
class="compare-pill"
|
||||||
|
style="
|
||||||
|
color: #94a3b8;
|
||||||
|
background: rgba(51, 65, 85, 0.3);
|
||||||
|
border-color: rgba(51, 65, 85, 0.4);
|
||||||
|
cursor: pointer;
|
||||||
|
"
|
||||||
|
>
|
||||||
|
<input
|
||||||
|
id="ocr-toggle"
|
||||||
|
type="checkbox"
|
||||||
|
checked
|
||||||
|
class="w-3 h-3 rounded text-indigo-600 bg-gray-700 border-gray-600"
|
||||||
|
/>
|
||||||
|
OCR
|
||||||
|
</label>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="compare-change-list">
|
||||||
|
<div id="change-list-empty" class="compare-change-empty">
|
||||||
|
Upload two PDFs to see differences.
|
||||||
|
</div>
|
||||||
|
<div id="compare-change-list" class="hidden"></div>
|
||||||
|
</div>
|
||||||
|
</aside>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
|||||||
313
src/tests/compare/diff-text-runs.test.ts
Normal file
313
src/tests/compare/diff-text-runs.test.ts
Normal file
@@ -0,0 +1,313 @@
|
|||||||
|
import { describe, expect, it } from 'vitest';
|
||||||
|
|
||||||
|
import { comparePageModels } from '@/js/compare/engine/compare-page-models.ts';
|
||||||
|
import { diffTextRuns } from '@/js/compare/engine/diff-text-runs.ts';
|
||||||
|
import {
|
||||||
|
mergeIntoLines,
|
||||||
|
sortCompareTextItems,
|
||||||
|
} from '@/js/compare/engine/extract-page-model.ts';
|
||||||
|
import type { ComparePageModel, CompareTextItem } from '@/js/compare/types.ts';
|
||||||
|
|
||||||
|
/**
 * Test helper: builds a CompareTextItem whose normalized text equals its raw
 * text and whose rectangle is a fixed 10x10 box at the origin.
 */
function makeItem(id: string, text: string): CompareTextItem {
|
||||||
|
return {
|
||||||
|
id,
|
||||||
|
text,
|
||||||
|
// No normalization is applied in the fixture: the raw text is reused as-is.
normalizedText: text,
|
||||||
|
rect: { x: 0, y: 0, width: 10, height: 10 },
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Test helper: builds a 100x100 ComparePageModel with source 'pdfjs' from the
 * given items. `plainText` is the items' normalized text joined by single
 * spaces, and `hasText` is true when at least one item is supplied.
 */
function makePage(
|
||||||
|
pageNumber: number,
|
||||||
|
textItems: CompareTextItem[]
|
||||||
|
): ComparePageModel {
|
||||||
|
return {
|
||||||
|
pageNumber,
|
||||||
|
width: 100,
|
||||||
|
height: 100,
|
||||||
|
textItems,
|
||||||
|
plainText: textItems.map((item) => item.normalizedText).join(' '),
|
||||||
|
hasText: textItems.length > 0,
|
||||||
|
source: 'pdfjs',
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
describe('diffTextRuns', () => {
|
||||||
|
// 'Hello world' vs 'Hello there': expects exactly one 'modified' change that
// covers only the differing token ('world' -> 'there').
it('detects modified tokens as one change', () => {
|
||||||
|
const result = diffTextRuns(
|
||||||
|
[makeItem('a', 'Hello'), makeItem('b', 'world')],
|
||||||
|
[makeItem('a', 'Hello'), makeItem('c', 'there')]
|
||||||
|
);
|
||||||
|
|
||||||
|
expect(result.summary).toEqual({ added: 0, removed: 0, modified: 1 });
|
||||||
|
expect(result.changes).toHaveLength(1);
|
||||||
|
expect(result.changes[0].type).toBe('modified');
|
||||||
|
expect(result.changes[0].beforeText).toBe('world');
|
||||||
|
expect(result.changes[0].afterText).toBe('there');
|
||||||
|
});
|
||||||
|
|
||||||
|
// A token appended on the second side is expected to be reported as a single
// 'added' change.
it('detects added tokens', () => {
|
||||||
|
const result = diffTextRuns(
|
||||||
|
[makeItem('a', 'Hello')],
|
||||||
|
[makeItem('a', 'Hello'), makeItem('b', 'again')]
|
||||||
|
);
|
||||||
|
|
||||||
|
expect(result.summary).toEqual({ added: 1, removed: 0, modified: 0 });
|
||||||
|
expect(result.changes[0].type).toBe('added');
|
||||||
|
});
|
||||||
|
|
||||||
|
// The second side prepends 'Example table' and rewrites the tail
// ('table new. Disabilit' -> 'table. Disability'). The expectation is two
// separate changes: one 'added' and one 'modified', not one large replacement.
it('splits compound replacements into discrete changes', () => {
|
||||||
|
const result = diffTextRuns(
|
||||||
|
[
|
||||||
|
makeItem('a', 'This'),
|
||||||
|
makeItem('b', 'is'),
|
||||||
|
makeItem('c', 'an'),
|
||||||
|
makeItem('d', 'example'),
|
||||||
|
makeItem('e', 'of'),
|
||||||
|
makeItem('f', 'a'),
|
||||||
|
makeItem('g', 'data'),
|
||||||
|
makeItem('h', 'table'),
|
||||||
|
makeItem('i', 'new.'),
|
||||||
|
makeItem('j', 'Disabilit'),
|
||||||
|
],
|
||||||
|
[
|
||||||
|
makeItem('k', 'Example'),
|
||||||
|
makeItem('l', 'table'),
|
||||||
|
makeItem('m', 'This'),
|
||||||
|
makeItem('n', 'is'),
|
||||||
|
makeItem('o', 'an'),
|
||||||
|
makeItem('p', 'example'),
|
||||||
|
makeItem('q', 'of'),
|
||||||
|
makeItem('r', 'a'),
|
||||||
|
makeItem('s', 'data'),
|
||||||
|
makeItem('t', 'table.'),
|
||||||
|
makeItem('u', 'Disability'),
|
||||||
|
]
|
||||||
|
);
|
||||||
|
|
||||||
|
expect(result.changes).toHaveLength(2);
|
||||||
|
expect(result.summary).toEqual({ added: 1, removed: 0, modified: 1 });
|
||||||
|
// Order of the two changes is not asserted; each is looked up by content.
expect(
|
||||||
|
result.changes.some(
|
||||||
|
(change) =>
|
||||||
|
change.type === 'added' && change.afterText === 'Example table'
|
||||||
|
)
|
||||||
|
).toBe(true);
|
||||||
|
expect(
|
||||||
|
result.changes.some(
|
||||||
|
(change) =>
|
||||||
|
change.type === 'modified' &&
|
||||||
|
change.beforeText === 'table new. Disabilit' &&
|
||||||
|
change.afterText === 'table. Disability'
|
||||||
|
)
|
||||||
|
).toBe(true);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// Suite for comparePageModels: covers the case where one side has no page.
describe('comparePageModels', () => {
|
||||||
|
// Passing null as the second page is expected to yield status 'left-only',
// one counted removal, and a first change of type 'page-removed'.
it('marks pages missing from the second document', () => {
|
||||||
|
const result = comparePageModels(
|
||||||
|
makePage(3, [makeItem('a', 'Only')]),
|
||||||
|
null
|
||||||
|
);
|
||||||
|
|
||||||
|
expect(result.status).toBe('left-only');
|
||||||
|
expect(result.summary.removed).toBe(1);
|
||||||
|
expect(result.changes[0].type).toBe('page-removed');
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// Suite for sortCompareTextItems: checks the ordering of items by position.
describe('sortCompareTextItems', () => {
|
||||||
|
// Items are supplied out of order; the expected result puts the smaller y
// first ('Title'), then orders the two items sharing y = 40 by x
// ('Next' at x = 10 before 'Body' at x = 60).
it('orders tokens by reading order', () => {
|
||||||
|
const items: CompareTextItem[] = [
|
||||||
|
{
|
||||||
|
...makeItem('b', 'Body'),
|
||||||
|
rect: { x: 60, y: 40, width: 10, height: 10 },
|
||||||
|
},
|
||||||
|
{
|
||||||
|
...makeItem('a', 'Title'),
|
||||||
|
rect: { x: 10, y: 10, width: 10, height: 10 },
|
||||||
|
},
|
||||||
|
{
|
||||||
|
...makeItem('c', 'Next'),
|
||||||
|
rect: { x: 10, y: 40, width: 10, height: 10 },
|
||||||
|
},
|
||||||
|
];
|
||||||
|
|
||||||
|
expect(
|
||||||
|
sortCompareTextItems(items).map((item) => item.normalizedText)
|
||||||
|
).toEqual(['Title', 'Next', 'Body']);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('mergeIntoLines', () => {
|
||||||
|
// Two items at the same y with a 10-unit horizontal gap are expected to merge
// into one item: text joined with a space, rectangle spanning x = 0 to 110.
it('merges items on the same Y-line into one item', () => {
|
||||||
|
const items: CompareTextItem[] = [
|
||||||
|
{
|
||||||
|
id: '0',
|
||||||
|
text: 'Hello',
|
||||||
|
normalizedText: 'Hello',
|
||||||
|
rect: { x: 0, y: 10, width: 50, height: 12 },
|
||||||
|
},
|
||||||
|
{
|
||||||
|
id: '1',
|
||||||
|
text: 'World',
|
||||||
|
normalizedText: 'World',
|
||||||
|
rect: { x: 60, y: 10, width: 50, height: 12 },
|
||||||
|
},
|
||||||
|
];
|
||||||
|
const merged = mergeIntoLines(sortCompareTextItems(items));
|
||||||
|
|
||||||
|
expect(merged).toHaveLength(1);
|
||||||
|
expect(merged[0].normalizedText).toBe('Hello World');
|
||||||
|
expect(merged[0].rect.x).toBe(0);
|
||||||
|
expect(merged[0].rect.width).toBe(110);
|
||||||
|
});
|
||||||
|
|
||||||
|
// The second fragment starts 0.4 units after the first one ends (x = 24.4 vs
// 0 + 24); fragments this close are expected to be joined without a space.
it('does not insert spaces inside a split word', () => {
|
||||||
|
const items: CompareTextItem[] = [
|
||||||
|
{
|
||||||
|
id: '0',
|
||||||
|
text: 'sam',
|
||||||
|
normalizedText: 'sam',
|
||||||
|
rect: { x: 0, y: 10, width: 24, height: 12 },
|
||||||
|
},
|
||||||
|
{
|
||||||
|
id: '1',
|
||||||
|
text: 'e',
|
||||||
|
normalizedText: 'e',
|
||||||
|
rect: { x: 24.4, y: 10, width: 8, height: 12 },
|
||||||
|
},
|
||||||
|
];
|
||||||
|
|
||||||
|
const merged = mergeIntoLines(sortCompareTextItems(items));
|
||||||
|
|
||||||
|
expect(merged).toHaveLength(1);
|
||||||
|
expect(merged[0].normalizedText).toBe('same');
|
||||||
|
});
|
||||||
|
|
||||||
|
// Items at y = 10 and y = 30 (height 12) are expected to stay as two
// separate line items, in top-to-bottom order.
it('keeps items on different Y-lines separate', () => {
|
||||||
|
const items: CompareTextItem[] = [
|
||||||
|
{
|
||||||
|
id: '0',
|
||||||
|
text: 'Line 1',
|
||||||
|
normalizedText: 'Line 1',
|
||||||
|
rect: { x: 0, y: 10, width: 50, height: 12 },
|
||||||
|
},
|
||||||
|
{
|
||||||
|
id: '1',
|
||||||
|
text: 'Line 2',
|
||||||
|
normalizedText: 'Line 2',
|
||||||
|
rect: { x: 0, y: 30, width: 50, height: 12 },
|
||||||
|
},
|
||||||
|
];
|
||||||
|
const merged = mergeIntoLines(sortCompareTextItems(items));
|
||||||
|
|
||||||
|
expect(merged).toHaveLength(2);
|
||||||
|
expect(merged[0].normalizedText).toBe('Line 1');
|
||||||
|
expect(merged[1].normalizedText).toBe('Line 2');
|
||||||
|
});
|
||||||
|
|
||||||
|
// The same line is supplied once as a single run ('Hello World') and once as
// two runs ('Hello', 'World'). After merging, both sides must have identical
// normalized text and diffTextRuns must report no changes.
it('produces same result for different text run boundaries', () => {
|
||||||
|
const pdf1Items: CompareTextItem[] = [
|
||||||
|
{
|
||||||
|
id: '0',
|
||||||
|
text: 'Hello World',
|
||||||
|
normalizedText: 'Hello World',
|
||||||
|
rect: { x: 0, y: 10, width: 100, height: 12 },
|
||||||
|
},
|
||||||
|
];
|
||||||
|
const pdf2Items: CompareTextItem[] = [
|
||||||
|
{
|
||||||
|
id: '0',
|
||||||
|
text: 'Hello',
|
||||||
|
normalizedText: 'Hello',
|
||||||
|
rect: { x: 0, y: 10, width: 45, height: 12 },
|
||||||
|
},
|
||||||
|
{
|
||||||
|
id: '1',
|
||||||
|
text: 'World',
|
||||||
|
normalizedText: 'World',
|
||||||
|
rect: { x: 55, y: 10, width: 45, height: 12 },
|
||||||
|
},
|
||||||
|
];
|
||||||
|
|
||||||
|
const merged1 = mergeIntoLines(sortCompareTextItems(pdf1Items));
|
||||||
|
const merged2 = mergeIntoLines(sortCompareTextItems(pdf2Items));
|
||||||
|
|
||||||
|
expect(merged1[0].normalizedText).toBe(merged2[0].normalizedText);
|
||||||
|
|
||||||
|
const result = diffTextRuns(merged1, merged2);
|
||||||
|
expect(result.changes).toHaveLength(0);
|
||||||
|
});
|
||||||
|
|
||||||
|
// End-to-end check of merge + diff: the second document adds 'PDF' to the
// first line and changes 'page' to 'pages' on the second line. Both sides
// merge into two lines, and the diff is expected to report one 'modified'
// change with beforeText 'page' and afterText 'PDF pages'.
it('detects actual changes after merging', () => {
|
||||||
|
const pdf1Items: CompareTextItem[] = [
|
||||||
|
{
|
||||||
|
id: '0',
|
||||||
|
text: 'Sample',
|
||||||
|
normalizedText: 'Sample',
|
||||||
|
rect: { x: 0, y: 10, width: 60, height: 14 },
|
||||||
|
},
|
||||||
|
{
|
||||||
|
id: '1',
|
||||||
|
text: 'page text here',
|
||||||
|
normalizedText: 'page text here',
|
||||||
|
rect: { x: 0, y: 30, width: 120, height: 14 },
|
||||||
|
},
|
||||||
|
];
|
||||||
|
const pdf2Items: CompareTextItem[] = [
|
||||||
|
{
|
||||||
|
id: '0',
|
||||||
|
text: 'Sample',
|
||||||
|
normalizedText: 'Sample',
|
||||||
|
rect: { x: 0, y: 10, width: 45, height: 14 },
|
||||||
|
},
|
||||||
|
{
|
||||||
|
id: '1',
|
||||||
|
text: 'PDF',
|
||||||
|
normalizedText: 'PDF',
|
||||||
|
rect: { x: 55, y: 10, width: 30, height: 14 },
|
||||||
|
},
|
||||||
|
{
|
||||||
|
id: '2',
|
||||||
|
text: 'pages text here',
|
||||||
|
normalizedText: 'pages text here',
|
||||||
|
rect: { x: 0, y: 30, width: 125, height: 14 },
|
||||||
|
},
|
||||||
|
];
|
||||||
|
|
||||||
|
const merged1 = mergeIntoLines(sortCompareTextItems(pdf1Items));
|
||||||
|
const merged2 = mergeIntoLines(sortCompareTextItems(pdf2Items));
|
||||||
|
|
||||||
|
expect(merged1).toHaveLength(2);
|
||||||
|
expect(merged2).toHaveLength(2);
|
||||||
|
|
||||||
|
const result = diffTextRuns(merged1, merged2);
|
||||||
|
expect(result.summary.modified).toBe(1);
|
||||||
|
expect(result.summary.added).toBe(0);
|
||||||
|
expect(result.summary.removed).toBe(0);
|
||||||
|
expect(result.changes).toHaveLength(1);
|
||||||
|
expect(result.changes[0].beforeText).toBe('page');
|
||||||
|
expect(result.changes[0].afterText).toBe('PDF pages');
|
||||||
|
});
|
||||||
|
|
||||||
|
// Expects the reported afterText to keep the source casing ('PDF').
// NOTE(review): the title mentions change descriptions, but only `afterText`
// is asserted; `description` is not checked here.
it('preserves original casing in change descriptions', () => {
|
||||||
|
const result = diffTextRuns(
|
||||||
|
[makeItem('a', 'Sample')],
|
||||||
|
[makeItem('b', 'Sample PDF')]
|
||||||
|
);
|
||||||
|
|
||||||
|
expect(result.changes[0].afterText).toBe('PDF');
|
||||||
|
});
|
||||||
|
|
||||||
|
// 'non' + 'tincidunt' on one side and 'nontincidunt' on the other differ only
// in where the text is split; the diff is expected to report no changes.
it('ignores joined versus split words when collapsed text matches', () => {
|
||||||
|
const result = diffTextRuns(
|
||||||
|
[makeItem('a', 'non'), makeItem('b', 'tincidunt')],
|
||||||
|
[makeItem('c', 'nontincidunt')]
|
||||||
|
);
|
||||||
|
|
||||||
|
expect(result.changes).toHaveLength(0);
|
||||||
|
expect(result.summary).toEqual({ added: 0, removed: 0, modified: 0 });
|
||||||
|
});
|
||||||
|
});
|
||||||
42
src/tests/compare/pair-pages.test.ts
Normal file
42
src/tests/compare/pair-pages.test.ts
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
import { describe, expect, it } from 'vitest';
|
||||||
|
|
||||||
|
import { pairPages } from '@/js/compare/engine/pair-pages.ts';
|
||||||
|
import type { ComparePageSignature } from '@/js/compare/types.ts';
|
||||||
|
|
||||||
|
/**
 * Builds a minimal page-signature fixture for the pairing tests.
 *
 * The text is split on runs of whitespace and empty fragments are dropped;
 * every remaining token becomes a token item whose id is
 * `<pageNumber>-<index>` and whose rect is zero-sized.
 *
 * @param pageNumber - Page number stored on the signature and used as id prefix.
 * @param text - Plain text of the page; also stored verbatim as `plainText`.
 * @returns The signature object with `hasText` true when `text` is non-empty.
 */
function signature(pageNumber: number, text: string): ComparePageSignature {
  const words = text.split(/\s+/).filter((word) => Boolean(word));

  const tokenItems = words.map((word, position) => {
    return {
      id: `${pageNumber}-${position}`,
      text: word,
      normalizedText: word,
      rect: { x: 0, y: 0, width: 0, height: 0 },
    };
  });

  return {
    pageNumber,
    plainText: text,
    // Based on the raw string length, so whitespace-only text still counts.
    hasText: text.length > 0,
    tokenItems,
  };
}
|
||||||
|
|
||||||
|
describe('pairPages', () => {
  // The right-hand document gains a leading page; the two original pages
  // must still be matched with their shifted counterparts.
  it('pairs reordered and inserted pages without collapsing alignment', () => {
    const leftPages = [signature(1, 'alpha beta'), signature(2, 'gamma delta')];
    const rightPages = [
      signature(1, 'intro page'),
      signature(2, 'alpha beta'),
      signature(3, 'gamma delta'),
    ];

    const pairs = pairPages(leftPages, rightPages);

    expect(pairs).toHaveLength(3);

    // Expected pairing, in output order: inserted page first, then the
    // two matched pages.
    const expectedPairs = [
      { leftPageNumber: null, rightPageNumber: 1 },
      { leftPageNumber: 1, rightPageNumber: 2 },
      { leftPageNumber: 2, rightPageNumber: 3 },
    ];
    expectedPairs.forEach((expected, position) => {
      expect(pairs[position]).toMatchObject(expected);
    });
  });
});
|
||||||
29
src/tests/compare/text-normalization.test.ts
Normal file
29
src/tests/compare/text-normalization.test.ts
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
import { describe, expect, it } from 'vitest';
|
||||||
|
|
||||||
|
import {
|
||||||
|
isLowQualityExtractedText,
|
||||||
|
joinNormalizedText,
|
||||||
|
normalizeCompareText,
|
||||||
|
} from '@/js/compare/engine/text-normalization.ts';
|
||||||
|
|
||||||
|
describe('text normalization', () => {
  // Punctuation tokens should attach to their neighbours instead of being
  // surrounded by spaces.
  it('joins punctuation without inserting stray spaces', () => {
    const withColon = joinNormalizedText(['Example', 'table', ':', 'v2']);
    expect(withColon).toBe('Example table: v2');

    const withQuotes = joinNormalizedText(['"', 'Quoted', 'text', '"']);
    expect(withQuotes).toBe('"Quoted text"');
  });

  // U+0000 (control) and U+E000 (private use) are each expected to come out
  // as a single space between the surrounding letters.
  it('normalizes private-use and control characters away', () => {
    const cleaned = normalizeCompareText('A\u0000B\uE000C');
    expect(cleaned).toBe('A B C');
  });

  it('flags punctuation-heavy extraction as low quality', () => {
    const punctuationOnly = isLowQualityExtractedText('! " # $ % & \'');
    expect(punctuationOnly).toBe(true);

    const ordinaryProse = isLowQualityExtractedText(
      'Example table 2026 revision'
    );
    expect(ordinaryProse).toBe(false);
  });
});
|
||||||
Reference in New Issue
Block a user