refactor: update PDF comparison types and enhance UI for better usability

- Refactored CompareState to import from a centralized type definition.
- Enhanced the compare-pdfs.html layout with improved styles for overlay and side-by-side modes.
- Added new CSS styles for various UI components including panels, buttons, and highlights.
- Implemented a new sidebar for displaying change summaries and filters.
- Created unit tests for text comparison logic, including diffing text runs and page pairing.
- Added tests for text normalization functions to ensure proper handling of punctuation and character normalization.
This commit is contained in:
alam00000
2026-03-08 23:55:04 +05:30
parent 86cbaf6cd3
commit 1d68691331
20 changed files with 3447 additions and 332 deletions

43
package-lock.json generated
View File

@@ -1,12 +1,12 @@
{ {
"name": "bento-pdf", "name": "bento-pdf",
"version": "2.4.0", "version": "2.4.1",
"lockfileVersion": 3, "lockfileVersion": 3,
"requires": true, "requires": true,
"packages": { "packages": {
"": { "": {
"name": "bento-pdf", "name": "bento-pdf",
"version": "2.4.0", "version": "2.4.1",
"license": "AGPL-3.0-only", "license": "AGPL-3.0-only",
"dependencies": { "dependencies": {
"@fontsource/cedarville-cursive": "^5.2.7", "@fontsource/cedarville-cursive": "^5.2.7",
@@ -30,6 +30,7 @@
"blob-stream": "^0.1.3", "blob-stream": "^0.1.3",
"bwip-js": "^4.8.0", "bwip-js": "^4.8.0",
"cropperjs": "^1.6.2", "cropperjs": "^1.6.2",
"diff": "^8.0.3",
"embedpdf-snippet": "file:vendor/embedpdf/embedpdf-snippet-2.3.0.tgz", "embedpdf-snippet": "file:vendor/embedpdf/embedpdf-snippet-2.3.0.tgz",
"heic2any": "^0.0.4", "heic2any": "^0.0.4",
"highlight.js": "^11.11.1", "highlight.js": "^11.11.1",
@@ -55,11 +56,13 @@
"markdown-it-task-lists": "^2.1.1", "markdown-it-task-lists": "^2.1.1",
"markdown-it-toc-done-right": "^4.2.0", "markdown-it-toc-done-right": "^4.2.0",
"mermaid": "^11.12.3", "mermaid": "^11.12.3",
"microdiff": "^1.5.0",
"node-forge": "^1.3.3", "node-forge": "^1.3.3",
"papaparse": "^5.5.3", "papaparse": "^5.5.3",
"pdf-lib": "^1.17.1", "pdf-lib": "^1.17.1",
"pdfjs-dist": "^5.4.624", "pdfjs-dist": "^5.4.624",
"pdfkit": "^0.17.2", "pdfkit": "^0.17.2",
"pixelmatch": "^7.1.0",
"postal-mime": "^2.7.3", "postal-mime": "^2.7.3",
"rete": "^2.0.6", "rete": "^2.0.6",
"rete-area-plugin": "^2.1.5", "rete-area-plugin": "^2.1.5",
@@ -6353,6 +6356,15 @@
"integrity": "sha512-ED3jP8saaweFTjeGX8HQPjeC1YYyZs98jGNZx6IiBvxW7JG5v492kamAQB3m2wop07CvU/RQmzcKr6bgcC5D/Q==", "integrity": "sha512-ED3jP8saaweFTjeGX8HQPjeC1YYyZs98jGNZx6IiBvxW7JG5v492kamAQB3m2wop07CvU/RQmzcKr6bgcC5D/Q==",
"license": "MIT" "license": "MIT"
}, },
"node_modules/diff": {
"version": "8.0.3",
"resolved": "https://registry.npmjs.org/diff/-/diff-8.0.3.tgz",
"integrity": "sha512-qejHi7bcSD4hQAZE0tNAawRK1ZtafHDmMTMkrrIGgSLl7hTnQHmKCeB45xAcbfTqK2zowkM3j3bHt/4b/ARbYQ==",
"license": "BSD-3-Clause",
"engines": {
"node": ">=0.3.1"
}
},
"node_modules/diffie-hellman": { "node_modules/diffie-hellman": {
"version": "5.0.3", "version": "5.0.3",
"resolved": "https://registry.npmjs.org/diffie-hellman/-/diffie-hellman-5.0.3.tgz", "resolved": "https://registry.npmjs.org/diffie-hellman/-/diffie-hellman-5.0.3.tgz",
@@ -9068,6 +9080,12 @@
"uuid": "^11.1.0" "uuid": "^11.1.0"
} }
}, },
"node_modules/microdiff": {
"version": "1.5.0",
"resolved": "https://registry.npmjs.org/microdiff/-/microdiff-1.5.0.tgz",
"integrity": "sha512-Drq+/THMvDdzRYrK0oxJmOKiC24ayUV8ahrt8l3oRK51PWt6gdtrIGrlIH3pT/lFh1z93FbAcidtsHcWbnRz8Q==",
"license": "MIT"
},
"node_modules/micromark-util-character": { "node_modules/micromark-util-character": {
"version": "2.1.1", "version": "2.1.1",
"resolved": "https://registry.npmjs.org/micromark-util-character/-/micromark-util-character-2.1.1.tgz", "resolved": "https://registry.npmjs.org/micromark-util-character/-/micromark-util-character-2.1.1.tgz",
@@ -9896,6 +9914,18 @@
"url": "https://github.com/sponsors/jonschlinkert" "url": "https://github.com/sponsors/jonschlinkert"
} }
}, },
"node_modules/pixelmatch": {
"version": "7.1.0",
"resolved": "https://registry.npmjs.org/pixelmatch/-/pixelmatch-7.1.0.tgz",
"integrity": "sha512-1wrVzJ2STrpmONHKBy228LM1b84msXDUoAzVEl0R8Mz4Ce6EPr+IVtxm8+yvrqLYMHswREkjYFaMxnyGnaY3Ng==",
"license": "ISC",
"dependencies": {
"pngjs": "^7.0.0"
},
"bin": {
"pixelmatch": "bin/pixelmatch"
}
},
"node_modules/pkg-dir": { "node_modules/pkg-dir": {
"version": "5.0.0", "version": "5.0.0",
"resolved": "https://registry.npmjs.org/pkg-dir/-/pkg-dir-5.0.0.tgz", "resolved": "https://registry.npmjs.org/pkg-dir/-/pkg-dir-5.0.0.tgz",
@@ -9925,6 +9955,15 @@
"resolved": "https://registry.npmjs.org/png-js/-/png-js-1.0.0.tgz", "resolved": "https://registry.npmjs.org/png-js/-/png-js-1.0.0.tgz",
"integrity": "sha512-k+YsbhpA9e+EFfKjTCH3VW6aoKlyNYI6NYdTfDL4CIvFnvsuO84ttonmZE7rc+v23SLTH8XX+5w/Ak9v0xGY4g==" "integrity": "sha512-k+YsbhpA9e+EFfKjTCH3VW6aoKlyNYI6NYdTfDL4CIvFnvsuO84ttonmZE7rc+v23SLTH8XX+5w/Ak9v0xGY4g=="
}, },
"node_modules/pngjs": {
"version": "7.0.0",
"resolved": "https://registry.npmjs.org/pngjs/-/pngjs-7.0.0.tgz",
"integrity": "sha512-LKWqWJRhstyYo9pGvgor/ivk2w94eSjE3RGVuzLGlr3NmD8bf7RcYGze1mNdEHRP6TRP6rMuDHk5t44hnTRyow==",
"license": "MIT",
"engines": {
"node": ">=14.19.0"
}
},
"node_modules/points-on-curve": { "node_modules/points-on-curve": {
"version": "0.2.0", "version": "0.2.0",
"resolved": "https://registry.npmjs.org/points-on-curve/-/points-on-curve-0.2.0.tgz", "resolved": "https://registry.npmjs.org/points-on-curve/-/points-on-curve-0.2.0.tgz",

View File

@@ -86,6 +86,7 @@
"blob-stream": "^0.1.3", "blob-stream": "^0.1.3",
"bwip-js": "^4.8.0", "bwip-js": "^4.8.0",
"cropperjs": "^1.6.2", "cropperjs": "^1.6.2",
"diff": "^8.0.3",
"embedpdf-snippet": "file:vendor/embedpdf/embedpdf-snippet-2.3.0.tgz", "embedpdf-snippet": "file:vendor/embedpdf/embedpdf-snippet-2.3.0.tgz",
"heic2any": "^0.0.4", "heic2any": "^0.0.4",
"highlight.js": "^11.11.1", "highlight.js": "^11.11.1",
@@ -111,11 +112,13 @@
"markdown-it-task-lists": "^2.1.1", "markdown-it-task-lists": "^2.1.1",
"markdown-it-toc-done-right": "^4.2.0", "markdown-it-toc-done-right": "^4.2.0",
"mermaid": "^11.12.3", "mermaid": "^11.12.3",
"microdiff": "^1.5.0",
"node-forge": "^1.3.3", "node-forge": "^1.3.3",
"papaparse": "^5.5.3", "papaparse": "^5.5.3",
"pdf-lib": "^1.17.1", "pdf-lib": "^1.17.1",
"pdfjs-dist": "^5.4.624", "pdfjs-dist": "^5.4.624",
"pdfkit": "^0.17.2", "pdfkit": "^0.17.2",
"pixelmatch": "^7.1.0",
"postal-mime": "^2.7.3", "postal-mime": "^2.7.3",
"rete": "^2.0.6", "rete": "^2.0.6",
"rete-area-plugin": "^2.1.5", "rete-area-plugin": "^2.1.5",

View File

@@ -238,19 +238,6 @@ input[type='file']::file-selector-button {
position: relative; position: relative;
width: 100%; width: 100%;
height: 75vh; height: 75vh;
overflow: auto;
border: 2px solid #374151;
border-radius: 0.5rem;
background-color: #1f2937;
}
/* This rule now ONLY applies to canvases in overlay mode */
.compare-viewer-wrapper.overlay-mode canvas {
position: absolute;
top: 0;
left: 0;
width: 100%;
height: auto;
} }
.compare-viewer-wrapper.side-by-side-mode { .compare-viewer-wrapper.side-by-side-mode {

View File

@@ -0,0 +1,78 @@
import type { ComparePageModel, ComparePageResult } from '../types.ts';
import { diffTextRuns } from './diff-text-runs.ts';
/**
 * Compares one aligned pair of pages and reports the differences.
 *
 * @param leftPage - Page model from the first PDF, or `null` when the pair has
 *   no page on that side.
 * @param rightPage - Page model from the second PDF, or `null` when the pair
 *   has no page on that side.
 * @returns A result whose `status` is `'left-only'` / `'right-only'` when only
 *   one side is present, `'changed'` when the text diff produced at least one
 *   change, and `'match'` otherwise (including when both sides are `null`).
 */
export function comparePageModels(
  leftPage: ComparePageModel | null,
  rightPage: ComparePageModel | null
): ComparePageResult {
  // Both sides present: run the word-level text diff.
  if (leftPage && rightPage) {
    const { changes, summary } = diffTextRuns(
      leftPage.textItems,
      rightPage.textItems
    );
    const hasChanges = changes.length > 0;
    return {
      status: hasChanges ? 'changed' : 'match',
      leftPageNumber: leftPage.pageNumber,
      rightPageNumber: rightPage.pageNumber,
      changes,
      summary,
      visualDiff: null,
      usedOcr: leftPage.source === 'ocr' || rightPage.source === 'ocr',
    };
  }

  // Only the first PDF has this page: report it as a removed page.
  if (leftPage) {
    return {
      status: 'left-only',
      leftPageNumber: leftPage.pageNumber,
      rightPageNumber: null,
      changes: [
        {
          id: 'page-removed',
          type: 'page-removed',
          description: `Page ${leftPage.pageNumber} exists only in the first PDF.`,
          // Only a 200-character preview of the page text is kept.
          beforeText: leftPage.plainText.slice(0, 200),
          afterText: '',
          beforeRects: [],
          afterRects: [],
        },
      ],
      summary: { added: 0, removed: 1, modified: 0 },
      visualDiff: null,
      usedOcr: leftPage.source === 'ocr',
    };
  }

  // Only the second PDF has this page: report it as an added page.
  if (rightPage) {
    return {
      status: 'right-only',
      leftPageNumber: null,
      rightPageNumber: rightPage.pageNumber,
      changes: [
        {
          id: 'page-added',
          type: 'page-added',
          description: `Page ${rightPage.pageNumber} exists only in the second PDF.`,
          beforeText: '',
          afterText: rightPage.plainText.slice(0, 200),
          beforeRects: [],
          afterRects: [],
        },
      ],
      summary: { added: 1, removed: 0, modified: 0 },
      visualDiff: null,
      usedOcr: rightPage.source === 'ocr',
    };
  }

  // Neither side has a page: nothing to compare.
  return {
    status: 'match',
    leftPageNumber: null,
    rightPageNumber: null,
    changes: [],
    summary: { added: 0, removed: 0, modified: 0 },
    visualDiff: null,
    usedOcr: false,
  };
}

View File

@@ -0,0 +1,237 @@
import { diffArrays } from 'diff';
import type {
CharPosition,
CompareChangeSummary,
CompareRectangle,
CompareTextChange,
CompareTextItem,
CompareWordToken,
} from '../types.ts';
/** Minimal word representation used by the diff: display text, comparison key and page rectangle. */
interface WordToken {
// Word text as it is shown in change descriptions.
word: string;
// Key handed to the array diff; the fallback tokenizer below builds it by lowercasing the word.
compareWord: string;
// Bounding rectangle of the word; forwarded into the highlight rectangles of a change.
rect: CompareRectangle;
}
/**
 * Returns one horizontal position per character of `line.normalizedText`.
 *
 * The line's own `charMap` is returned as-is when its length matches the
 * normalized text; otherwise the line rectangle's width is divided evenly
 * across the characters.
 */
function getCharMap(line: CompareTextItem): CharPosition[] {
  const charCount = line.normalizedText.length;
  const existing = line.charMap;
  if (existing && existing.length === charCount) {
    return existing;
  }

  // Evenly spaced fallback; the divisor is clamped so empty text cannot divide by zero.
  const step = line.rect.width / Math.max(charCount, 1);
  const positions: CharPosition[] = [];
  for (let i = 0; i < charCount; i += 1) {
    positions.push({ x: line.rect.x + i * step, width: step });
  }
  return positions;
}
/**
 * Converts a line into word tokens for diffing.
 *
 * Pre-computed `wordTokens` on the line are preferred (copied down to
 * `word`, `compareWord` and `rect`). Otherwise the normalized text is split on
 * whitespace and each word's rectangle is derived from the character map.
 */
function splitLineIntoWords(line: CompareTextItem): WordToken[] {
  const presetTokens = line.wordTokens;
  if (presetTokens && presetTokens.length > 0) {
    return presetTokens.map(
      ({ word, compareWord, rect }: CompareWordToken) => ({
        word,
        compareWord,
        rect,
      })
    );
  }

  const text = line.normalizedText;
  const pieces = text.split(/\s+/).filter(Boolean);
  if (pieces.length === 0) return [];

  const positions = getCharMap(line);
  const tokens: WordToken[] = [];
  // Search cursor, advanced past each word so repeated words map to successive occurrences.
  let cursor = 0;

  for (const piece of pieces) {
    const firstIndex = text.indexOf(piece, cursor);
    const lastIndex = firstIndex + piece.length - 1;
    cursor = firstIndex + piece.length;

    const head = positions[firstIndex];
    const tail = positions[lastIndex];
    let rect: CompareRectangle;

    if (head && tail) {
      rect = {
        x: head.x,
        y: line.rect.y,
        width: tail.x + tail.width - head.x,
        height: line.rect.height,
      };
    } else {
      // Character positions are missing: approximate with an even per-character width.
      const perChar = line.rect.width / Math.max(text.length, 1);
      rect = {
        x: line.rect.x + firstIndex * perChar,
        y: line.rect.y,
        width: piece.length * perChar,
        height: line.rect.height,
      };
    }

    tokens.push({ word: piece, compareWord: piece.toLowerCase(), rect });
  }

  return tokens;
}
/**
 * Merges rectangles that sit next to each other on the same line into single
 * bounding boxes.
 *
 * Rectangles are ordered top-to-bottom then left-to-right (on a copy; the
 * input is not mutated). A rectangle joins the current cluster when its `y`
 * is within `max(0.6 * height, 4)` of the cluster's last rectangle and its
 * left edge is no further than two heights past that rectangle's right edge.
 */
function groupAdjacentRects(rects: CompareRectangle[]): CompareRectangle[] {
  if (rects.length === 0) return [];

  const ordered = rects.slice().sort((a, b) => a.y - b.y || a.x - b.x);
  const clusters: CompareRectangle[][] = [];

  for (const rect of ordered) {
    if (clusters.length > 0) {
      const cluster = clusters[clusters.length - 1];
      // Adjacency is judged against the most recently added rectangle, not the cluster bounds.
      const tail = cluster[cluster.length - 1];
      const onSameLine =
        Math.abs(rect.y - tail.y) < Math.max(tail.height * 0.6, 4);
      const nearby = rect.x <= tail.x + tail.width + tail.height * 2;
      if (onSameLine && nearby) {
        cluster.push(rect);
        continue;
      }
    }
    clusters.push([rect]);
  }

  return clusters.map((cluster) => {
    const left = cluster.reduce((acc, r) => Math.min(acc, r.x), Infinity);
    const top = cluster.reduce((acc, r) => Math.min(acc, r.y), Infinity);
    const right = cluster.reduce(
      (acc, r) => Math.max(acc, r.x + r.width),
      -Infinity
    );
    const bottom = cluster.reduce(
      (acc, r) => Math.max(acc, r.y + r.height),
      -Infinity
    );
    return { x: left, y: top, width: right - left, height: bottom - top };
  });
}
/** Concatenates the comparison keys of `words` with no separator. */
function collapseWords(words: WordToken[]) {
  const keys: string[] = [];
  for (const token of words) {
    keys.push(token.compareWord);
  }
  return keys.join('');
}
/**
 * Reports whether two non-empty word runs spell the same text once word
 * boundaries are removed (e.g. a word split in two on one side).
 * Returns `false` when either run is empty.
 */
function areEquivalentIgnoringWordBreaks(
  beforeWords: WordToken[],
  afterWords: WordToken[]
) {
  return (
    beforeWords.length > 0 &&
    afterWords.length > 0 &&
    collapseWords(beforeWords) === collapseWords(afterWords)
  );
}
/**
 * Appends one text change to `changes`, built from the given word runs.
 *
 * Nothing is appended when both runs join to empty text. The change id is
 * `<type>-<current length of changes>`. For `'modified'` both sides are kept,
 * for `'removed'` only the "before" side, and for any other type only the
 * "after" side.
 */
function createWordChange(
  changes: CompareTextChange[],
  type: CompareTextChange['type'],
  beforeWords: WordToken[],
  afterWords: WordToken[]
) {
  const joinWords = (tokens: WordToken[]) =>
    tokens.map((token) => token.word).join(' ');
  const rectsOf = (tokens: WordToken[]) =>
    groupAdjacentRects(tokens.map((token) => token.rect));

  const beforeText = joinWords(beforeWords);
  const afterText = joinWords(afterWords);
  if (beforeText === '' && afterText === '') return;

  const id = `${type}-${changes.length}`;
  const beforeRects = rectsOf(beforeWords);
  const afterRects = rectsOf(afterWords);

  switch (type) {
    case 'modified':
      changes.push({
        id,
        type,
        description: `Replaced "${beforeText}" with "${afterText}"`,
        beforeText,
        afterText,
        beforeRects,
        afterRects,
      });
      return;
    case 'removed':
      changes.push({
        id,
        type,
        description: `Removed "${beforeText}"`,
        beforeText,
        afterText: '',
        beforeRects,
        afterRects: [],
      });
      return;
    default:
      changes.push({
        id,
        type,
        description: `Added "${afterText}"`,
        beforeText: '',
        afterText,
        beforeRects: [],
        afterRects,
      });
  }
}
/**
 * Counts how many changes are of type `'added'`, `'removed'` and `'modified'`.
 * Changes of any other type are not counted.
 */
function toSummary(changes: CompareTextChange[]): CompareChangeSummary {
  const totals: CompareChangeSummary = { added: 0, removed: 0, modified: 0 };
  for (const { type } of changes) {
    switch (type) {
      case 'added':
        totals.added += 1;
        break;
      case 'removed':
        totals.removed += 1;
        break;
      case 'modified':
        totals.modified += 1;
        break;
      default:
        break;
    }
  }
  return totals;
}
/**
 * Produces a word-level diff between two lists of text lines.
 *
 * Lines are flattened into word tokens and the tokens' `compareWord` keys are
 * diffed with `diffArrays`. A removed run directly followed by an added run is
 * reported as one `'modified'` change, unless the two runs are identical once
 * word breaks are ignored, in which case no change is reported.
 *
 * @returns The list of changes and their per-type counts.
 */
export function diffTextRuns(
  beforeItems: CompareTextItem[],
  afterItems: CompareTextItem[]
) {
  const beforeWords = beforeItems.flatMap(splitLineIntoWords);
  const afterWords = afterItems.flatMap(splitLineIntoWords);

  const parts = diffArrays(
    beforeWords.map((w) => w.compareWord),
    afterWords.map((w) => w.compareWord)
  );

  const changes: CompareTextChange[] = [];
  // Read positions into the two token lists, kept in step with the diff parts.
  let beforePos = 0;
  let afterPos = 0;
  let cursor = 0;

  while (cursor < parts.length) {
    const part = parts[cursor];
    const size = part.value.length;
    cursor += 1;

    if (part.removed) {
      const dropped = beforeWords.slice(beforePos, beforePos + size);
      beforePos += size;

      const follower = parts[cursor];
      if (!follower?.added) {
        createWordChange(changes, 'removed', dropped, []);
        continue;
      }

      // A removal immediately followed by an addition is a replacement; consume both parts.
      const inserted = afterWords.slice(
        afterPos,
        afterPos + follower.value.length
      );
      afterPos += follower.value.length;
      cursor += 1;

      if (!areEquivalentIgnoringWordBreaks(dropped, inserted)) {
        createWordChange(changes, 'modified', dropped, inserted);
      }
    } else if (part.added) {
      const inserted = afterWords.slice(afterPos, afterPos + size);
      afterPos += size;
      createWordChange(changes, 'added', [], inserted);
    } else {
      // Unchanged run: advance both sides.
      beforePos += size;
      afterPos += size;
    }
  }

  return { changes, summary: toSummary(changes) };
}

View File

@@ -0,0 +1,520 @@
import * as pdfjsLib from 'pdfjs-dist';
import type {
ComparePageModel,
CompareTextItem,
CharPosition,
CompareWordToken,
} from '../types.ts';
import {
joinCompareTextItems,
normalizeCompareText,
} from './text-normalization.ts';
/**
 * Shape of the text entries this module reads from `getTextContent().items`
 * (entries are selected with the `'str' in item` guard in extractPageModel).
 */
type PageTextItem = {
// Raw text of the run.
str: string;
// Run extent; added to the transform origin before conversion to viewport points.
width: number;
height: number;
// Transform array: indices 0/1 feed the font-scale estimate, 4/5 are used as the run origin.
transform: number[];
dir: string;
// Key used to look up the font family in the text-content styles map.
fontName: string;
hasEOL: boolean;
};
/** Per-font style lookup keyed by `fontName`; only `fontFamily` is read in this module. */
type TextStyles = Record<string, { fontFamily?: string }>;
// Off-screen canvas used only for text measurement; null when no DOM `document` exists.
const measurementCanvas =
  typeof document === 'undefined' ? null : document.createElement('canvas');
// 2D context of that canvas, or null when the canvas or the context is unavailable.
const measurementContext = measurementCanvas?.getContext('2d') ?? null;
// Memo of measured widths keyed by `${fontSpec}|${text}`; only created when measuring is possible.
const textMeasurementCache: Map<string, number> | null =
  measurementContext === null ? null : new Map<string, number>();
// Font string most recently assigned to the context, to avoid redundant assignments.
let lastMeasurementFont = '';
// Relative widths used when real measurement is unavailable or yields an unusable value.
const DEFAULT_CHAR_WIDTH = 1;
const DEFAULT_SPACE_WIDTH = 0.33;
/**
 * Decides from punctuation alone whether `current` should be glued to
 * `previous` without a space.
 *
 * True when `current` starts with closing punctuation or a quote character,
 * or when `previous` ends with an opening bracket, slash, quote or hyphen.
 * Always false when `previous` is empty.
 */
function shouldJoinTokenWithPrevious(previous: string, current: string) {
  if (!previous) return false;
  return (
    /^[,.;:!?%)\]}]/.test(current) ||
    /^[''"']/u.test(current) ||
    /[([{/"'-]$/u.test(previous)
  );
}
/**
 * Measures the width of `text` when drawn with the CSS font `fontSpec`.
 *
 * Without a 2D measurement context the width is estimated: 0 for empty text,
 * `DEFAULT_SPACE_WIDTH` for a single space, otherwise `DEFAULT_CHAR_WIDTH`
 * per character. With a context, results are memoised per font/text pair.
 *
 * @param fontSpec - Value assigned to the canvas context's `font` property.
 * @param text - Text to measure.
 * @returns The measured (or estimated) width; 0 when the context reports a
 *   falsy width.
 */
function measureTextWidth(fontSpec: string, text: string): number {
  // Callers measure every prefix of every text run, so the memo would otherwise
  // grow without limit on large documents. Capping it bounds memory; dropping
  // entries only costs a re-measurement and never changes a returned width.
  const MAX_CACHED_MEASUREMENTS = 20000;

  if (!measurementContext) {
    if (!text) return 0;
    if (text === ' ') return DEFAULT_SPACE_WIDTH;
    return text.length * DEFAULT_CHAR_WIDTH;
  }

  // Assigning `font` is comparatively expensive, so only do it when the spec changes.
  if (lastMeasurementFont !== fontSpec) {
    measurementContext.font = fontSpec;
    lastMeasurementFont = fontSpec;
  }

  const key = `${fontSpec}|${text}`;
  const cached = textMeasurementCache?.get(key);
  if (cached !== undefined) {
    return cached;
  }

  const width = measurementContext.measureText(text).width || 0;
  if (textMeasurementCache) {
    if (textMeasurementCache.size >= MAX_CACHED_MEASUREMENTS) {
      textMeasurementCache.clear();
    }
    textMeasurementCache.set(key, width);
  }
  return width;
}
/**
 * Splits one text item into word tokens, each with an estimated rectangle in
 * viewport coordinates.
 *
 * Word extents are estimated by measuring progressively longer prefixes of the
 * item's text and mapping each word's share of the total measured width onto
 * the item's extent. Returns an empty array when the item text is empty or
 * whitespace only.
 *
 * @param viewport - Viewport used to convert the item's corner points.
 * @param item - Text item to split.
 * @param fallbackRect - Rectangle supplying the x/y used when a computed
 *   coordinate is not finite, and the minimum token height.
 * @param styles - Font style lookup keyed by the item's `fontName`.
 */
function buildItemWordTokens(
viewport: pdfjsLib.PageViewport,
item: PageTextItem,
fallbackRect: CompareTextItem['rect'],
styles: TextStyles
): CompareWordToken[] {
const rawText = item.str || '';
if (!rawText.trim()) {
return [];
}
const totalLen = Math.max(rawText.length, 1);
const textStyle = item.fontName ? styles[item.fontName] : undefined;
const fontFamily = textStyle?.fontFamily ?? 'sans-serif';
// Font size estimate: length of the first column of the item transform, never below 0.5.
const fontScale = Math.max(
0.5,
Math.hypot(item.transform[0], item.transform[1]) || 0
);
const fontSpec = `${fontScale}px ${fontFamily}`;
// weights[i] is the width contributed by character i: the difference between
// the measured widths of consecutive prefixes.
const weights: number[] = new Array(totalLen);
let runningText = '';
let previousAdvance = 0;
for (let index = 0; index < totalLen; index += 1) {
runningText += rawText[index];
const advance = measureTextWidth(fontSpec, runningText);
let width = advance - previousAdvance;
// Non-finite or non-positive increments are replaced by a default width.
if (!Number.isFinite(width) || width <= 0) {
width = rawText[index] === ' ' ? DEFAULT_SPACE_WIDTH : DEFAULT_CHAR_WIDTH;
}
weights[index] = width;
previousAdvance = advance;
}
// If the full-string measurement is unusable, fall back to default widths for every character.
if (!Number.isFinite(previousAdvance) || previousAdvance <= 0) {
for (let index = 0; index < totalLen; index += 1) {
weights[index] =
rawText[index] === ' ' ? DEFAULT_SPACE_WIDTH : DEFAULT_CHAR_WIDTH;
}
}
// prefix[i] is the summed weight of the first i characters.
const prefix: number[] = new Array(totalLen + 1);
prefix[0] = 0;
for (let index = 0; index < totalLen; index += 1) {
prefix[index + 1] = prefix[index] + weights[index];
}
const totalWeight = prefix[totalLen] || 1;
const rawX = item.transform[4];
const rawY = item.transform[5];
// Four corners of the item (origin, origin + width, origin + height, both) in viewport points.
const transformed = [
viewport.convertToViewportPoint(rawX, rawY),
viewport.convertToViewportPoint(rawX + item.width, rawY),
viewport.convertToViewportPoint(rawX, rawY + item.height),
viewport.convertToViewportPoint(rawX + item.width, rawY + item.height),
];
const xs = transformed.map(([x]) => x);
const ys = transformed.map(([, y]) => y);
const left = Math.min(...xs);
const right = Math.max(...xs);
const top = Math.min(...ys);
const bottom = Math.max(...ys);
// Vectors from the origin corner along the item's width and along its height.
const [baselineStart, baselineEnd, verticalEnd] = transformed;
const baselineVector: [number, number] = [
baselineEnd[0] - baselineStart[0],
baselineEnd[1] - baselineStart[1],
];
const verticalVector: [number, number] = [
verticalEnd[0] - baselineStart[0],
verticalEnd[1] - baselineStart[1],
];
// Both vectors must have non-negligible length for the oriented path to be used.
const hasOrientationVectors =
Math.hypot(baselineVector[0], baselineVector[1]) > 1e-6 &&
Math.hypot(verticalVector[0], verticalVector[1]) > 1e-6;
const tokens: CompareWordToken[] = [];
const wordRegex = /\S+/gu;
let match: RegExpExecArray | null;
// End index (exclusive) of the previously visited word in rawText.
let previousEnd = 0;
while ((match = wordRegex.exec(rawText)) !== null) {
const tokenText = match[0];
const normalizedWord = normalizeCompareText(tokenText);
// Words that normalize to nothing are skipped, but still advance previousEnd.
if (!normalizedWord) {
previousEnd = match.index + tokenText.length;
continue;
}
const startIndex = match.index;
const endIndex = startIndex + tokenText.length;
// Fractions (0..1) of the total weight at which this word starts and ends.
const relStart = prefix[startIndex] / totalWeight;
const relEnd = prefix[endIndex] / totalWeight;
let wordLeft: number;
let wordRight: number;
let wordTop: number;
let wordBottom: number;
if (hasOrientationVectors) {
// Walk along the width vector to the word's start/end, then span the height
// vector; the token rect is the axis-aligned bounds of those four corners.
const segStart: [number, number] = [
baselineStart[0] + baselineVector[0] * relStart,
baselineStart[1] + baselineVector[1] * relStart,
];
const segEnd: [number, number] = [
baselineStart[0] + baselineVector[0] * relEnd,
baselineStart[1] + baselineVector[1] * relEnd,
];
const cornerPoints: Array<[number, number]> = [
segStart,
[segStart[0] + verticalVector[0], segStart[1] + verticalVector[1]],
[segEnd[0] + verticalVector[0], segEnd[1] + verticalVector[1]],
segEnd,
];
wordLeft = Math.min(...cornerPoints.map(([x]) => x));
wordRight = Math.max(...cornerPoints.map(([x]) => x));
wordTop = Math.min(...cornerPoints.map(([, y]) => y));
wordBottom = Math.max(...cornerPoints.map(([, y]) => y));
} else {
// Degenerate extent: interpolate horizontally across the item's bounding box.
const segLeft = left + (right - left) * relStart;
const segRight = left + (right - left) * relEnd;
wordLeft = Math.min(segLeft, segRight);
wordRight = Math.max(segLeft, segRight);
wordTop = top;
wordBottom = bottom;
}
// Width is at least 1; height is at least the fallback rectangle's height.
const width = Math.max(wordRight - wordLeft, 1);
const height = Math.max(wordBottom - wordTop, fallbackRect.height);
// Text between the previously visited word and this one.
const gapText = rawText.slice(previousEnd, startIndex);
const previousToken = tokens[tokens.length - 1];
tokens.push({
word: normalizedWord,
compareWord: normalizedWord.toLowerCase(),
rect: {
x: Number.isFinite(wordLeft) ? wordLeft : fallbackRect.x,
y: Number.isFinite(wordTop) ? wordTop : fallbackRect.y,
width,
height,
},
// Joined when the gap is non-empty and contains no whitespace, or when the
// punctuation rules say this word attaches to the previous token.
joinsWithPrevious:
(gapText.length > 0 && !/\s/u.test(gapText)) ||
(previousToken
? shouldJoinTokenWithPrevious(previousToken.word, normalizedWord)
: false),
});
previousEnd = endIndex;
}
return tokens;
}
/**
 * Converts a text item into a `CompareTextItem` positioned in viewport space.
 *
 * The rectangle's origin comes from the item transform combined with the
 * viewport transform; `y` is shifted up by the height so it refers to the top
 * edge. Width and height are never smaller than 1.
 *
 * @param index - Position of the item in its source list; used in the id.
 */
function toRect(
  viewport: pdfjsLib.PageViewport,
  item: PageTextItem,
  index: number,
  styles: TextStyles
) {
  const normalizedText = normalizeCompareText(item.str);
  const matrix = pdfjsLib.Util.transform(viewport.transform, item.transform);

  // Prefer the transformed vertical scale; fall back to the scaled item height when it is 0/NaN.
  const rawHeight = Math.abs(matrix[3]) || item.height * viewport.scale;
  const height = Math.max(rawHeight, 1);

  const rect = {
    x: matrix[4],
    y: matrix[5] - height,
    width: Math.max(item.width * viewport.scale, 1),
    height,
  };
  const wordTokens = buildItemWordTokens(viewport, item, rect, styles);

  return {
    id: `${index}-${normalizedText}`,
    text: item.str,
    normalizedText,
    rect,
    wordTokens,
  } satisfies CompareTextItem;
}
/**
 * Returns a copy of `items` in reading order: top-to-bottom, then
 * left-to-right, with the id as the final tie-breaker.
 *
 * Two items count as being on the same line when their `y` values differ by
 * no more than `max(0.6 * smaller height, 4)`; `x` values within 1 of each
 * other are treated as equal.
 */
export function sortCompareTextItems(items: CompareTextItem[]) {
  const ordered = items.slice();
  ordered.sort((left, right) => {
    const verticalGap = left.rect.y - right.rect.y;
    const tolerance = Math.max(
      Math.min(left.rect.height, right.rect.height) * 0.6,
      4
    );
    if (Math.abs(verticalGap) > tolerance) {
      return verticalGap;
    }

    const horizontalGap = left.rect.x - right.rect.x;
    return Math.abs(horizontalGap) > 1
      ? horizontalGap
      : left.id.localeCompare(right.id);
  });
  return ordered;
}
/**
 * Average width of a non-whitespace character in `item`: the rectangle width
 * divided by the number of non-whitespace characters (at least 1).
 */
function averageCharacterWidth(item: CompareTextItem) {
  const visibleChars = item.normalizedText.replace(/\s+/g, '').length;
  return item.rect.width / Math.max(visibleChars, 1);
}
/**
 * Decides whether a space belongs between two horizontally adjacent items.
 *
 * No space is inserted when either text is empty, when punctuation at the
 * boundary attaches the two (closing punctuation or a quote at the start of
 * `right`; an opening bracket, slash, quote or hyphen at the end of `left`),
 * or when the items touch or overlap. Otherwise a space is inserted when the
 * gap is at least `max(0.45 * narrower average character width, 1.5)`.
 */
function shouldInsertSpaceBetweenItems(
  left: CompareTextItem,
  right: CompareTextItem
) {
  const leftText = left.normalizedText;
  const rightText = right.normalizedText;
  if (!leftText || !rightText) return false;

  const attachedByPunctuation =
    /^[,.;:!?%)\]}]/.test(rightText) ||
    /^[''"']/u.test(rightText) ||
    /[([{/"'-]$/u.test(leftText);
  if (attachedByPunctuation) return false;

  const gap = right.rect.x - (left.rect.x + left.rect.width);
  if (gap <= 0) return false;

  const narrowest = Math.min(
    averageCharacterWidth(left),
    averageCharacterWidth(right)
  );
  return gap >= Math.max(narrowest * 0.45, 1.5);
}
/**
 * Joins the fragments of one line into a single normalized string and builds
 * a per-character position map alongside it.
 *
 * Each fragment's characters are spread evenly over the fragment's width.
 * Where a space is inserted between fragments, it gets one map entry covering
 * the gap (at least 1 wide).
 *
 * @returns The normalized line text and the character map; both empty for an
 *   empty line.
 */
function mergeLineText(lineItems: CompareTextItem[]): {
  text: string;
  charMap: CharPosition[];
} {
  const charMap: CharPosition[] = [];
  if (lineItems.length === 0) {
    return { text: '', charMap };
  }

  // Appends one evenly spaced entry per character of the fragment.
  const appendFragment = (fragment: CompareTextItem) => {
    const glyphCount = fragment.normalizedText.length;
    const step = fragment.rect.width / Math.max(glyphCount, 1);
    for (let offset = 0; offset < glyphCount; offset += 1) {
      charMap.push({ x: fragment.rect.x + offset * step, width: step });
    }
  };

  let combined = lineItems[0].normalizedText;
  appendFragment(lineItems[0]);

  lineItems.slice(1).forEach((fragment, offset) => {
    // `offset` indexes the sliced list, so it is also the index of the preceding fragment.
    const prior = lineItems[offset];
    if (shouldInsertSpaceBetweenItems(prior, fragment)) {
      const priorRight = prior.rect.x + prior.rect.width;
      charMap.push({
        x: priorRight,
        width: Math.max(fragment.rect.x - priorRight, 1),
      });
      combined += ` ${fragment.normalizedText}`;
    } else {
      combined += fragment.normalizedText;
    }
    appendFragment(fragment);
  });

  return { text: normalizeCompareText(combined), charMap };
}
/**
 * Fuses two word tokens into one: texts and comparison keys are concatenated
 * (left then right) and the rectangle becomes the bounding box of both.
 * The result carries only `word`, `compareWord` and `rect`.
 */
function mergeWordTokenRects(
  left: CompareWordToken,
  right: CompareWordToken
): CompareWordToken {
  const a = left.rect;
  const b = right.rect;

  const x = Math.min(a.x, b.x);
  const y = Math.min(a.y, b.y);
  const rightEdge = Math.max(a.x + a.width, b.x + b.width);
  const bottomEdge = Math.max(a.y + a.height, b.y + b.height);

  return {
    word: `${left.word}${right.word}`,
    compareWord: `${left.compareWord}${right.compareWord}`,
    rect: { x, y, width: rightEdge - x, height: bottomEdge - y },
  };
}
/**
 * Builds the word tokens for a whole line from the tokens of its fragments.
 *
 * Returns `undefined` when no fragment carries word tokens. A fragment
 * without tokens contributes a single token made from its normalized text and
 * rectangle. Within a fragment, a token is fused with the previous one when
 * its `joinsWithPrevious` flag is set; across fragments, the first token is
 * fused with the previous one when no space belongs between the two fragments.
 */
function buildMergedWordTokens(lineItems: CompareTextItem[]) {
  const anyTokens = lineItems.some(
    (item) => item.wordTokens && item.wordTokens.length > 0
  );
  if (!anyTokens) {
    return undefined;
  }

  const merged: CompareWordToken[] = [];

  for (let itemIndex = 0; itemIndex < lineItems.length; itemIndex += 1) {
    const item = lineItems[itemIndex];
    const precedingItem = itemIndex > 0 ? lineItems[itemIndex - 1] : null;

    const tokens: CompareWordToken[] =
      item.wordTokens && item.wordTokens.length > 0
        ? item.wordTokens
        : [
            {
              word: item.normalizedText,
              compareWord: item.normalizedText.toLowerCase(),
              rect: item.rect,
            },
          ];

    for (let tokenIndex = 0; tokenIndex < tokens.length; tokenIndex += 1) {
      const token = tokens[tokenIndex];

      let attach: boolean;
      if (tokenIndex > 0) {
        attach = Boolean(token.joinsWithPrevious);
      } else {
        // First token of a fragment: joining depends on the spacing between fragments.
        attach = precedingItem
          ? !shouldInsertSpaceBetweenItems(precedingItem, item)
          : false;
      }

      if (merged.length > 0 && attach) {
        const lastIndex = merged.length - 1;
        merged[lastIndex] = mergeWordTokenRects(merged[lastIndex], token);
      } else {
        merged.push({
          word: token.word,
          compareWord: token.compareWord,
          rect: token.rect,
        });
      }
    }
  }

  return merged;
}
/**
 * Groups already-sorted text items into lines and merges each line into one
 * `CompareTextItem`.
 *
 * An item joins the current line when its `y` is within
 * `max(0.6 * smaller height, 4)` of the line's first item. Each merged line
 * gets the id `line-<index>`, the bounding box of its fragments, the raw
 * texts joined by spaces, and the merged normalized text, character map and
 * word tokens.
 *
 * @param sortedItems - Items in reading order.
 */
export function mergeIntoLines(
  sortedItems: CompareTextItem[]
): CompareTextItem[] {
  if (sortedItems.length === 0) return [];

  const rows: CompareTextItem[][] = [];
  for (const item of sortedItems) {
    if (rows.length > 0) {
      const row = rows[rows.length - 1];
      // Every candidate is compared against the first item of the line.
      const anchor = row[0];
      const tolerance = Math.max(
        Math.min(anchor.rect.height, item.rect.height) * 0.6,
        4
      );
      if (Math.abs(item.rect.y - anchor.rect.y) <= tolerance) {
        row.push(item);
        continue;
      }
    }
    rows.push([item]);
  }

  return rows.map((fragments, rowIndex) => {
    const { text: normalizedText, charMap } = mergeLineText(fragments);

    const left = Math.min(...fragments.map((f) => f.rect.x));
    const top = Math.min(...fragments.map((f) => f.rect.y));
    const right = Math.max(...fragments.map((f) => f.rect.x + f.rect.width));
    const bottom = Math.max(...fragments.map((f) => f.rect.y + f.rect.height));

    return {
      id: `line-${rowIndex}`,
      text: fragments.map((f) => f.text).join(' '),
      normalizedText,
      rect: { x: left, y: top, width: right - left, height: bottom - top },
      fragments,
      charMap,
      wordTokens: buildMergedWordTokens(fragments),
    };
  });
}
/**
 * Extracts the comparable text model of a page.
 *
 * Text entries are read from the page, positioned in viewport space, stripped
 * of entries whose normalized text is empty, sorted into reading order and
 * merged into lines.
 *
 * @param page - Page to read text from.
 * @param viewport - Viewport that defines the coordinate space and page size.
 * @returns The page model with `source` set to `'pdfjs'`.
 */
export async function extractPageModel(
  page: pdfjsLib.PDFPageProxy,
  viewport: pdfjsLib.PageViewport
): Promise<ComparePageModel> {
  // NOTE(review): confirm `disableCombineTextItems` is still honoured by the
  // installed pdfjs-dist version.
  const textContent = await page.getTextContent({
    disableCombineTextItems: true,
  });
  const styles = textContent.styles ?? {};

  const stringItems = textContent.items.filter(
    (item): item is PageTextItem => 'str' in item
  );
  const positioned = stringItems.map((item, index) =>
    toRect(viewport, item, index, styles)
  );
  const nonEmpty = positioned.filter((item) => item.normalizedText.length > 0);

  const textItems = mergeIntoLines(sortCompareTextItems(nonEmpty));

  return {
    pageNumber: page.pageNumber,
    width: viewport.width,
    height: viewport.height,
    textItems,
    plainText: joinCompareTextItems(textItems),
    hasText: textItems.length > 0,
    source: 'pdfjs',
  };
}

View File

@@ -0,0 +1,76 @@
import Tesseract from 'tesseract.js';
import type { ComparePageModel, CompareTextItem } from '../types.ts';
import { mergeIntoLines, sortCompareTextItems } from './extract-page-model.ts';
import {
joinCompareTextItems,
normalizeCompareText,
} from './text-normalization.ts';
/** Subset of a recognized OCR word that this module reads. */
type OcrWord = {
// Recognized text of the word.
text: string;
// Word bounds: (x0, y0) is used as the rectangle origin, x1 - x0 / y1 - y0 as its size.
bbox: {
x0: number;
y0: number;
x1: number;
y1: number;
};
};
/**
 * Runs OCR on a rendered page canvas and converts the recognized words into a
 * page model.
 *
 * Words whose normalized text is empty are dropped. Each remaining word
 * becomes a text item with a single word token; items are then sorted and
 * merged into lines.
 *
 * @param canvas - Rendered page; its dimensions become the model's size.
 * @param language - Language passed to the recognizer.
 * @param onProgress - Optional callback receiving the recognizer's status and
 *   progress (0 when no progress value is reported).
 * @returns A page model with `source` set to `'ocr'`. `pageNumber` is always 0
 *   here; presumably the caller assigns the real number (verify against caller).
 */
export async function recognizePageCanvas(
  canvas: HTMLCanvasElement,
  language: string,
  onProgress?: (status: string, progress: number) => void
): Promise<ComparePageModel> {
  const result = await Tesseract.recognize(canvas, language, {
    logger: (message) => {
      onProgress?.(message.status, message.progress || 0);
    },
  });

  const ocrData = result.data as unknown as { words?: OcrWord[] };
  const recognized = (ocrData.words || []) as OcrWord[];

  const words: CompareTextItem[] = [];
  recognized.forEach((word, index) => {
    const normalizedText = normalizeCompareText(word.text || '');
    if (!normalizedText) return;

    // A fresh rectangle per call, so the item and its token do not share one object.
    const boundsOf = (): CompareTextItem['rect'] => ({
      x: word.bbox.x0,
      y: word.bbox.y0,
      width: Math.max(word.bbox.x1 - word.bbox.x0, 1),
      height: Math.max(word.bbox.y1 - word.bbox.y0, 1),
    });

    words.push({
      // The index is the word's position in the recognizer output, before filtering.
      id: `ocr-${index}-${normalizedText}`,
      text: word.text,
      normalizedText,
      rect: boundsOf(),
      wordTokens: [
        {
          word: normalizedText,
          compareWord: normalizedText.toLowerCase(),
          rect: boundsOf(),
        },
      ],
    });
  });

  const mergedItems = mergeIntoLines(sortCompareTextItems(words));

  return {
    pageNumber: 0,
    width: canvas.width,
    height: canvas.height,
    textItems: mergedItems,
    plainText: joinCompareTextItems(mergedItems),
    hasText: mergedItems.length > 0,
    source: 'ocr',
  };
}

View File

@@ -0,0 +1,61 @@
import * as pdfjsLib from 'pdfjs-dist';
import type { ComparePageSignature, CompareTextItem } from '../types.ts';
import {
joinNormalizedText,
normalizeCompareText,
} from './text-normalization.ts';
/**
 * Shape of the text-content entries selected by the `'str' in item` guard
 * below. Only `str` is read in this module.
 */
type SignatureTextItem = {
// Raw text of the entry; normalized to form a signature token.
str: string;
dir: string;
transform: number[];
width: number;
height: number;
fontName: string;
hasEOL: boolean;
};
/**
 * Wraps a signature token as a `CompareTextItem` with a zero-sized rectangle
 * at the origin. The token serves as both raw and normalized text.
 */
function tokenToItem(token: string, index: number): CompareTextItem {
  const emptyRect = { x: 0, y: 0, width: 0, height: 0 };
  const item: CompareTextItem = {
    id: `token-${index}-${token}`,
    text: token,
    normalizedText: token,
    rect: emptyRect,
  };
  return item;
}
/**
 * Builds a lightweight text signature for one page.
 *
 * Every text entry is normalized, empty results are dropped, and only the
 * first 500 remaining tokens are kept.
 *
 * @param pdfDoc - Document to read from.
 * @param pageNumber - Page number passed to `getPage`.
 * @returns The signature: page number, joined token text, a has-text flag and
 *   the tokens wrapped as text items.
 */
export async function extractPageSignature(
  pdfDoc: pdfjsLib.PDFDocumentProxy,
  pageNumber: number
): Promise<ComparePageSignature> {
  const page = await pdfDoc.getPage(pageNumber);
  const { items } = await page.getTextContent();

  const stringItems = items.filter(
    (item): item is SignatureTextItem => 'str' in item
  );
  const normalized = stringItems.map((item) => normalizeCompareText(item.str));
  const nonEmpty = normalized.filter(Boolean);
  // Cap the signature size.
  const limitedTokens = nonEmpty.slice(0, 500);

  const tokenItems = limitedTokens.map((token, index) =>
    tokenToItem(token, index)
  );

  return {
    pageNumber,
    plainText: joinNormalizedText(limitedTokens),
    hasText: limitedTokens.length > 0,
    tokenItems,
  };
}
/**
 * Extracts the signatures of all pages of a document, one page at a time and
 * in page order.
 *
 * @param pdfDoc - Document to read from.
 * @param onProgress - Optional callback invoked before each page is processed
 *   with the 1-based page number and the total page count.
 * @returns One signature per page, in page order.
 */
export async function extractDocumentSignatures(
  pdfDoc: pdfjsLib.PDFDocumentProxy,
  onProgress?: (pageNumber: number, totalPages: number) => void
) {
  const signatures: ComparePageSignature[] = [];
  let pageNumber = 1;
  while (pageNumber <= pdfDoc.numPages) {
    onProgress?.(pageNumber, pdfDoc.numPages);
    const signature = await extractPageSignature(pdfDoc, pageNumber);
    signatures.push(signature);
    pageNumber += 1;
  }
  return signatures;
}

View File

@@ -0,0 +1,122 @@
import type { ComparePagePair, ComparePageSignature } from '../types.ts';
/** Splits `text` on whitespace and returns the set of distinct non-empty tokens. */
function tokenize(text: string) {
  const unique = new Set<string>();
  for (const piece of text.split(/\s+/)) {
    if (piece) unique.add(piece);
  }
  return unique;
}
/**
 * Scores how likely two pages are the same page, in the range 0..1.
 *
 * - Neither page has text: 0.7 when the page numbers match, else 0.35.
 * - Exactly one page has text: 0.08.
 * - Both have text: Jaccard similarity of their token sets, plus 0.1 when the
 *   page numbers match, capped at 1.
 */
function similarityScore(
  left: ComparePageSignature,
  right: ComparePageSignature
) {
  const samePosition = left.pageNumber === right.pageNumber;

  if (!left.hasText || !right.hasText) {
    if (!left.hasText && !right.hasText) {
      return samePosition ? 0.7 : 0.35;
    }
    return 0.08;
  }

  const leftTokens = tokenize(left.plainText);
  const rightTokens = tokenize(right.plainText);

  let shared = 0;
  for (const token of leftTokens) {
    if (rightTokens.has(token)) shared += 1;
  }
  // |A ∪ B| = |A| + |B| - |A ∩ B|
  const unionSize = leftTokens.size + rightTokens.size - shared;
  const jaccard = unionSize === 0 ? 0 : shared / unionSize;

  return Math.min(jaccard + (samePosition ? 0.1 : 0), 1);
}
/**
 * Aligns the pages of two documents with a dynamic-programming sequence
 * alignment.
 *
 * Matching two pages costs `1 - similarity`; leaving a page unpaired costs
 * 0.8. Ties prefer a match, then an unpaired left page, then an unpaired
 * right page.
 *
 * @returns Pairs in document order with `pairIndex` starting at 1. A page
 *   without a partner has `null` on the other side and confidence 0; matched
 *   pairs carry their similarity score as confidence.
 */
export function pairPages(
  leftPages: ComparePageSignature[],
  rightPages: ComparePageSignature[]
) {
  type Step = 'match' | 'left' | 'right';
  const gapCost = 0.8;
  const rows = leftPages.length + 1;
  const cols = rightPages.length + 1;

  // cost[r][c]: cheapest alignment of the first r left pages with the first c right pages.
  const cost: number[][] = [];
  const step: Step[][] = [];
  for (let r = 0; r < rows; r += 1) {
    cost.push(new Array<number>(cols).fill(0));
    step.push(new Array<Step>(cols).fill('match'));
  }

  // Borders: aligning against nothing leaves every page unpaired.
  for (let r = 1; r < rows; r += 1) {
    cost[r][0] = r * gapCost;
    step[r][0] = 'left';
  }
  for (let c = 1; c < cols; c += 1) {
    cost[0][c] = c * gapCost;
    step[0][c] = 'right';
  }

  for (let r = 1; r < rows; r += 1) {
    for (let c = 1; c < cols; c += 1) {
      const similarity = similarityScore(leftPages[r - 1], rightPages[c - 1]);
      const viaMatch = cost[r - 1][c - 1] + (1 - similarity);
      const viaLeft = cost[r - 1][c] + gapCost;
      const viaRight = cost[r][c - 1] + gapCost;
      const best = Math.min(viaMatch, viaLeft, viaRight);
      cost[r][c] = best;
      step[r][c] =
        best === viaMatch ? 'match' : best === viaLeft ? 'left' : 'right';
    }
  }

  // Walk back from the bottom-right corner; pairs are collected last-to-first.
  const collected: ComparePagePair[] = [];
  let r = leftPages.length;
  let c = rightPages.length;
  while (r > 0 || c > 0) {
    const direction = step[r][c];
    if (r > 0 && c > 0 && direction === 'match') {
      collected.push({
        pairIndex: 0,
        leftPageNumber: leftPages[r - 1].pageNumber,
        rightPageNumber: rightPages[c - 1].pageNumber,
        confidence: similarityScore(leftPages[r - 1], rightPages[c - 1]),
      });
      r -= 1;
      c -= 1;
    } else if (r > 0 && (c === 0 || direction === 'left')) {
      collected.push({
        pairIndex: 0,
        leftPageNumber: leftPages[r - 1].pageNumber,
        rightPageNumber: null,
        confidence: 0,
      });
      r -= 1;
    } else if (c > 0) {
      collected.push({
        pairIndex: 0,
        leftPageNumber: null,
        rightPageNumber: rightPages[c - 1].pageNumber,
        confidence: 0,
      });
      c -= 1;
    }
  }

  // Restore document order and number the pairs from 1.
  collected.reverse();
  return collected.map((pair, index) => ({ ...pair, pairIndex: index + 1 }));
}

View File

@@ -0,0 +1,64 @@
import type { CompareTextItem } from '../types.ts';
/**
 * Canonicalizes extracted text so that visually equivalent strings compare
 * equal.
 *
 * Steps, in order: Unicode NFKC normalization, C0/C1 control characters and
 * BMP private-use code points replaced by spaces, whitespace runs collapsed
 * to a single space, and leading/trailing whitespace removed.
 *
 * @param text - Raw text as extracted from a page.
 * @returns The normalized text (possibly empty).
 */
export function normalizeCompareText(text: string) {
  const compatibilityForm = text.normalize('NFKC');
  const withoutControls = compatibilityForm.replace(
    /[\u0000-\u001F\u007F-\u009F]/g,
    ' '
  );
  const withoutPrivateUse = withoutControls.replace(
    /[\u{E000}-\u{F8FF}]/gu,
    ' '
  );
  const collapsed = withoutPrivateUse.replace(/\s+/g, ' ');
  return collapsed.trim();
}
/**
 * Decides whether `next` should be glued directly onto `current` rather than
 * separated by a space.
 *
 * No space is inserted when nothing has been accumulated yet, when `next`
 * begins with closing punctuation or an apostrophe, when `next` is a lone
 * quote character, or when `current` ends with an opening bracket, slash,
 * quote, or hyphen.
 */
function shouldAppendWithoutSpace(current: string, next: string) {
  const nothingAccumulated = !current;
  return (
    nothingAccumulated ||
    /^[,.;:!?%)\]}]/.test(next) ||
    /^["']$/.test(next) ||
    /^[']/u.test(next) ||
    /[([{/"'-]$/u.test(current)
  );
}
/**
 * Concatenates already-normalized tokens into a single string.
 *
 * Empty tokens are skipped. A single space separates consecutive tokens
 * unless `shouldAppendWithoutSpace` says they belong together (for example
 * before closing punctuation or after an opening bracket).
 *
 * @param tokens - Normalized text fragments in reading order.
 * @returns The joined text; an empty string for no tokens.
 */
export function joinNormalizedText(tokens: string[]) {
  let joined = '';
  for (const token of tokens) {
    if (!token) {
      continue;
    }
    const separator = shouldAppendWithoutSpace(joined, token) ? '' : ' ';
    joined = `${joined}${separator}${token}`;
  }
  return joined;
}
/**
 * Joins the `normalizedText` of each item, in order, using the spacing rules
 * of `joinNormalizedText`.
 */
export function joinCompareTextItems(items: CompareTextItem[]) {
  const normalizedTokens = items.map(({ normalizedText }) => normalizedText);
  return joinNormalizedText(normalizedTokens);
}
/**
 * Heuristically flags extracted text that looks like noise rather than
 * readable content.
 *
 * After normalization the text is considered low quality when any of these
 * hold:
 * - it is empty or contains no letter/number characters at all;
 * - it has at least 12 visible characters, fewer than 45% of them are
 *   letters/numbers, and more than 35% are other symbols;
 * - it has at least 6 tokens and fewer than 60% of them contain a letter or
 *   number.
 *
 * @param text - Raw extracted text.
 * @returns `true` when the text is judged unusable for comparison.
 */
export function isLowQualityExtractedText(text: string) {
  const normalized = normalizeCompareText(text);
  if (normalized.length === 0) {
    return true;
  }

  const alphaNumericPattern = /[\p{L}\p{N}]/u;

  // Count visible (non-whitespace) code points and how many are letters/digits.
  let visibleCount = 0;
  let alphaNumericCount = 0;
  for (const character of normalized) {
    if (character.trim().length === 0) {
      continue;
    }
    visibleCount += 1;
    if (alphaNumericPattern.test(character)) {
      alphaNumericCount += 1;
    }
  }

  if (alphaNumericCount === 0) {
    return true;
  }

  const symbolCount = visibleCount - alphaNumericCount;
  const dominatedBySymbols =
    visibleCount >= 12 &&
    alphaNumericCount / visibleCount < 0.45 &&
    symbolCount / visibleCount > 0.35;
  if (dominatedBySymbols) {
    return true;
  }

  const tokens = normalized.split(/\s+/).filter(Boolean);
  const tokenWithAlphaNumericCount = tokens.filter((token) =>
    alphaNumericPattern.test(token)
  ).length;
  return tokens.length >= 6 && tokenWithAlphaNumericCount / tokens.length < 0.6;
}

View File

@@ -0,0 +1,134 @@
import pixelmatch from 'pixelmatch';
import type { CompareVisualDiff } from '../types.ts';
/**
 * Rectangle, in the coordinate space of the normalized comparison canvases,
 * that `renderVisualDiff` crops its output to.
 */
type FocusRegion = {
x: number;
y: number;
width: number;
height: number;
};
/** Creates a detached `<canvas>` element with the given pixel dimensions. */
function createCanvas(width: number, height: number) {
  const element = document.createElement('canvas');
  Object.assign(element, { width, height });
  return element;
}
/**
 * Paints `targetCanvas` white and draws `sourceCanvas` centered on it at its
 * natural size.
 *
 * @throws Error when a 2D context cannot be obtained for `targetCanvas`.
 */
function drawNormalized(
  sourceCanvas: HTMLCanvasElement,
  targetCanvas: HTMLCanvasElement
) {
  const ctx = targetCanvas.getContext('2d');
  if (!ctx) {
    throw new Error('Could not create comparison canvas context.');
  }

  const { width: targetWidth, height: targetHeight } = targetCanvas;

  // White backdrop so areas not covered by the source compare as blank.
  ctx.fillStyle = '#ffffff';
  ctx.fillRect(0, 0, targetWidth, targetHeight);

  const centeredOffset = (outer: number, inner: number) =>
    Math.floor((outer - inner) / 2);
  ctx.drawImage(
    sourceCanvas,
    centeredOffset(targetWidth, sourceCanvas.width),
    centeredOffset(targetHeight, sourceCanvas.height)
  );
}
/**
 * Pixel-compares two rendered page canvases and paints the result into
 * `outputCanvas`: the second page as the base image with the pixelmatch diff
 * image blended on top.
 *
 * Both inputs are first centered on white canvases sized to the larger width
 * and height of the two, so the comparison always runs on equal dimensions.
 * When `focusRegion` is given, only that region is drawn and `outputCanvas`
 * is resized to the region's size; otherwise the full area is drawn.
 *
 * The returned statistics always describe the full normalized area, not just
 * the focus region.
 *
 * @param canvas1 - Rendered page from the first document.
 * @param canvas2 - Rendered page from the second document.
 * @param outputCanvas - Canvas that is resized and drawn into.
 * @param focusRegion - Optional crop rectangle in normalized-canvas pixels.
 * @returns Mismatch pixel count, its ratio to the total area, and a flag.
 * @throws Error when any required 2D context cannot be obtained.
 */
export function renderVisualDiff(
canvas1: HTMLCanvasElement,
canvas2: HTMLCanvasElement,
outputCanvas: HTMLCanvasElement,
focusRegion?: FocusRegion
): CompareVisualDiff {
// Shared comparison size; at least 1x1 so empty inputs still yield valid canvases.
const width = Math.max(canvas1.width, canvas2.width, 1);
const height = Math.max(canvas1.height, canvas2.height, 1);
const normalizedCanvas1 = createCanvas(width, height);
const normalizedCanvas2 = createCanvas(width, height);
drawNormalized(canvas1, normalizedCanvas1);
drawNormalized(canvas2, normalizedCanvas2);
// Provisional size; the output canvas is resized again once the region is known.
outputCanvas.width = width;
outputCanvas.height = height;
const context1 = normalizedCanvas1.getContext('2d');
const context2 = normalizedCanvas2.getContext('2d');
const outputContext = outputCanvas.getContext('2d');
if (!context1 || !context2 || !outputContext) {
throw new Error('Could not create visual diff context.');
}
const image1 = context1.getImageData(0, 0, width, height);
const image2 = context2.getImageData(0, 0, width, height);
const diffImage = outputContext.createImageData(width, height);
// pixelmatch fills diffImage.data and returns the number of mismatched pixels.
const mismatchPixels = pixelmatch(
image1.data,
image2.data,
diffImage.data,
width,
height,
{
threshold: 0.12,
includeAA: false,
alpha: 0.2,
diffMask: false,
diffColor: [239, 68, 68],
diffColorAlt: [34, 197, 94],
}
);
// The diff goes onto its own canvas so it can be drawn (cropped/scaled) with drawImage.
const overlayCanvas = createCanvas(width, height);
const overlayContext = overlayCanvas.getContext('2d');
if (!overlayContext) {
throw new Error('Could not create visual diff overlay context.');
}
overlayContext.putImageData(diffImage, 0, 0);
// NOTE(review): width/height are clamped to the full canvas size without
// subtracting x/y, so x + width (or y + height) can still exceed the canvas,
// and x/y have no upper bound — confirm this is intended.
const region = focusRegion
? {
x: Math.max(Math.floor(focusRegion.x), 0),
y: Math.max(Math.floor(focusRegion.y), 0),
width: Math.min(Math.ceil(focusRegion.width), width),
height: Math.min(Math.ceil(focusRegion.height), height),
}
: { x: 0, y: 0, width, height };
// Final output size matches the region (never smaller than 1x1).
outputCanvas.width = Math.max(region.width, 1);
outputCanvas.height = Math.max(region.height, 1);
outputContext.fillStyle = '#ffffff';
outputContext.fillRect(0, 0, outputCanvas.width, outputCanvas.height);
// Base layer: the second document's page, cropped to the region.
outputContext.drawImage(
normalizedCanvas2,
region.x,
region.y,
region.width,
region.height,
0,
0,
outputCanvas.width,
outputCanvas.height
);
// Diff layer blended on top at 90% opacity.
outputContext.globalAlpha = 0.9;
outputContext.drawImage(
overlayCanvas,
region.x,
region.y,
region.width,
region.height,
0,
0,
outputCanvas.width,
outputCanvas.height
);
outputContext.globalAlpha = 1;
return {
mismatchPixels,
mismatchRatio: mismatchPixels / Math.max(width * height, 1),
hasDiff: mismatchPixels > 0,
};
}

View File

@@ -0,0 +1,77 @@
import type { ComparePagePair, ComparePageResult } from '../types.ts';
/**
 * Escapes the five HTML-significant characters (`&`, `<`, `>`, `"`, `'`) so
 * `text` can be embedded safely in element content or attribute values.
 */
function escapeHtml(text: string) {
  const entities: Record<string, string> = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&#039;',
  };
  return text.replace(
    /[&<>"']/g,
    (character) => entities[character] ?? character
  );
}
/**
 * Renders a standalone HTML report summarizing a document comparison.
 *
 * The report contains overall added/removed/modified totals followed by one
 * card per entry in `results`. The pair at the same index in `pairs` supplies
 * the card's number, page numbers and confidence; when it is missing the card
 * falls back to the positional index, "none" and 0%.
 *
 * File names, statuses, change types and change descriptions are
 * HTML-escaped before being embedded.
 *
 * @param fileName1 - Display name of the first PDF.
 * @param fileName2 - Display name of the second PDF.
 * @param pairs - Page pairs, index-aligned with `results`.
 * @param results - Per-pair comparison results.
 * @returns The complete HTML document as a string.
 */
export function buildCompareReport(
  fileName1: string,
  fileName2: string,
  pairs: ComparePagePair[],
  results: ComparePageResult[]
) {
  const totals = { added: 0, removed: 0, modified: 0 };
  results.forEach(({ summary }) => {
    totals.added += summary.added;
    totals.removed += summary.removed;
    totals.modified += summary.modified;
  });

  const sections: string[] = [];
  results.forEach((result, index) => {
    const pair = pairs[index];

    let changeItems = '';
    result.changes.forEach((change) => {
      changeItems += `<li><strong>${escapeHtml(change.type)}</strong>: ${escapeHtml(change.description)}</li>`;
    });

    const heading = pair?.pairIndex || index + 1;
    const leftLabel = pair?.leftPageNumber ?? 'none';
    const rightLabel = pair?.rightPageNumber ?? 'none';
    const confidencePercent = ((pair?.confidence || 0) * 100).toFixed(0);
    const ocrNote = result.usedOcr ? ' | OCR used' : '';
    const listBody = changeItems || '<li>No semantic changes detected.</li>';

    sections.push(`
<section class="pair-card">
<h2>Comparison ${heading}</h2>
<p class="meta">PDF 1 page: ${leftLabel} | PDF 2 page: ${rightLabel} | Confidence: ${confidencePercent}%</p>
<p class="meta">Status: ${escapeHtml(result.status)}${ocrNote}</p>
<p class="meta">Added: ${result.summary.added} | Removed: ${result.summary.removed} | Modified: ${result.summary.modified}</p>
<ul>${listBody}</ul>
</section>
`);
  });
  const rows = sections.join('');

  return `<!doctype html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Compare report</title>
<style>
body { font-family: ui-sans-serif, system-ui, sans-serif; margin: 0; padding: 2rem; background: #111827; color: #e5e7eb; }
.summary { display: grid; grid-template-columns: repeat(3, minmax(0, 1fr)); gap: 1rem; margin: 1.5rem 0; }
.card, .pair-card { background: #1f2937; border: 1px solid #374151; border-radius: 12px; padding: 1rem 1.25rem; }
.pair-card { margin-bottom: 1rem; }
.meta { color: #9ca3af; font-size: 0.95rem; }
h1, h2 { margin: 0 0 0.75rem 0; }
ul { margin: 0.75rem 0 0 1.25rem; }
</style>
</head>
<body>
<h1>PDF Compare Report</h1>
<p class="meta">PDF 1: ${escapeHtml(fileName1)} | PDF 2: ${escapeHtml(fileName2)}</p>
<div class="summary">
<div class="card"><div class="meta">Added</div><div>${totals.added}</div></div>
<div class="card"><div class="meta">Removed</div><div>${totals.removed}</div></div>
<div class="card"><div class="meta">Modified</div><div>${totals.modified}</div></div>
</div>
${rows}
</body>
</html>`;
}

View File

@@ -0,0 +1,18 @@
import { buildCompareReport } from './build-report.ts';
import type { ComparePagePair, ComparePageResult } from '../types.ts';
/**
 * Builds the HTML comparison report and triggers a browser download of it as
 * `bentopdf-compare-report.html`.
 *
 * The temporary object URL is released shortly after the click rather than
 * synchronously: some browsers start the download asynchronously, and
 * revoking the URL in the same tick can cancel it.
 *
 * @param fileName1 - Display name of the first PDF.
 * @param fileName2 - Display name of the second PDF.
 * @param pairs - Page pairs, index-aligned with `results`.
 * @param results - Per-pair comparison results.
 */
export function exportCompareHtmlReport(
  fileName1: string,
  fileName2: string,
  pairs: ComparePagePair[],
  results: ComparePageResult[]
) {
  // Grace period before the blob URL is released.
  const revokeDelayMs = 1000;

  const html = buildCompareReport(fileName1, fileName2, pairs, results);
  const blob = new Blob([html], { type: 'text/html;charset=utf-8' });
  const url = URL.createObjectURL(blob);
  try {
    const anchor = document.createElement('a');
    anchor.href = url;
    anchor.download = 'bentopdf-compare-report.html';
    anchor.click();
  } finally {
    // Always release the URL, even if triggering the download throws.
    setTimeout(() => URL.revokeObjectURL(url), revokeDelayMs);
  }
}

113
src/js/compare/types.ts Normal file
View File

@@ -0,0 +1,113 @@
import type * as pdfjsLib from 'pdfjs-dist';
/** How the two documents are presented in the compare viewer. */
export type CompareViewMode = 'overlay' | 'side-by-side';
/** Axis-aligned rectangle described by its origin and size. */
export interface CompareRectangle {
x: number;
y: number;
width: number;
height: number;
}
/** Horizontal position and width of one entry in a text item's `charMap`. */
export interface CharPosition {
x: number;
width: number;
}
/**
 * A single word within a text item.
 * NOTE(review): `compareWord` is presumably the normalized form used for
 * diffing and `joinsWithPrevious` presumably marks a word that continues the
 * previous token without a space — confirm against the producer.
 */
export interface CompareWordToken {
word: string;
compareWord: string;
rect: CompareRectangle;
joinsWithPrevious?: boolean;
}
/**
 * A piece of page text together with its bounding rectangle.
 * `normalizedText` is the value used when items are joined for comparison.
 */
export interface CompareTextItem {
id: string;
text: string;
normalizedText: string;
rect: CompareRectangle;
fragments?: CompareTextItem[];
charMap?: CharPosition[];
wordTokens?: CompareWordToken[];
}
/** Text content and dimensions of one page, from PDF.js extraction or OCR. */
export interface ComparePageModel {
pageNumber: number;
width: number;
height: number;
textItems: CompareTextItem[];
plainText: string;
hasText: boolean;
source: 'pdfjs' | 'ocr';
}
/** Lightweight per-page text summary used to pair pages across documents. */
export interface ComparePageSignature {
pageNumber: number;
plainText: string;
hasText: boolean;
tokenItems: CompareTextItem[];
}
/**
 * One aligned entry across the two documents. A `null` page number means the
 * page has no counterpart on that side.
 */
export interface ComparePagePair {
pairIndex: number;
leftPageNumber: number | null;
rightPageNumber: number | null;
confidence: number;
}
/** Pixel-level comparison statistics for a pair of rendered pages. */
export interface CompareVisualDiff {
mismatchPixels: number;
mismatchRatio: number;
hasDiff: boolean;
}
/** Kind of difference reported for a change entry. */
export type CompareChangeType =
| 'added'
| 'removed'
| 'modified'
| 'page-added'
| 'page-removed';
/** A single detected change with its text and highlight rectangles on each side. */
export interface CompareTextChange {
id: string;
type: CompareChangeType;
description: string;
beforeText: string;
afterText: string;
beforeRects: CompareRectangle[];
afterRects: CompareRectangle[];
}
/** Counts of changes by category. */
export interface CompareChangeSummary {
added: number;
removed: number;
modified: number;
}
/** Outcome of comparing one page pair. */
export interface ComparePageResult {
status: 'match' | 'changed' | 'left-only' | 'right-only';
leftPageNumber: number | null;
rightPageNumber: number | null;
changes: CompareTextChange[];
summary: CompareChangeSummary;
visualDiff: CompareVisualDiff | null;
confidence?: number;
usedOcr?: boolean;
}
/** Change-list filter selection; `'all'` applies no category restriction. */
export type CompareFilterType = 'added' | 'removed' | 'modified' | 'all';
/** Mutable UI state of the compare tool. */
export interface CompareState {
pdfDoc1: pdfjsLib.PDFDocumentProxy | null;
pdfDoc2: pdfjsLib.PDFDocumentProxy | null;
currentPage: number;
viewMode: CompareViewMode;
isSyncScroll: boolean;
currentComparison: ComparePageResult | null;
activeChangeIndex: number;
pagePairs: ComparePagePair[];
activeFilter: CompareFilterType;
changeSearchQuery: string;
useOcr: boolean;
ocrLanguage: string;
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,9 +1 @@
import * as pdfjsLib from 'pdfjs-dist'; export type { CompareState } from '../compare/types.ts';
export interface CompareState {
pdfDoc1: pdfjsLib.PDFDocumentProxy | null;
pdfDoc2: pdfjsLib.PDFDocumentProxy | null;
currentPage: number;
viewMode: 'overlay' | 'side-by-side';
isSyncScroll: boolean;
}

View File

@@ -72,31 +72,362 @@
<style> <style>
.compare-viewer-wrapper.overlay-mode { .compare-viewer-wrapper.overlay-mode {
position: relative; position: relative;
background: #ffffff;
overflow: hidden;
padding: 1.5rem;
} }
.compare-viewer-wrapper.overlay-mode #panel-1, .compare-viewer-wrapper.overlay-mode #panel-1 {
.compare-viewer-wrapper.overlay-mode #panel-2 {
position: absolute;
top: 0;
left: 0;
width: 100%; width: 100%;
height: 100%; height: 100%;
overflow: auto;
scrollbar-width: none;
-ms-overflow-style: none;
}
.compare-viewer-wrapper.overlay-mode #panel-1::-webkit-scrollbar {
display: none;
} }
.compare-viewer-wrapper.overlay-mode #panel-2 { .compare-viewer-wrapper.overlay-mode #panel-2 {
position: absolute;
inset: 1.5rem;
overflow: hidden;
pointer-events: none; pointer-events: none;
background: transparent;
}
.compare-viewer-wrapper.overlay-mode .compare-panel-label {
display: none;
}
.compare-viewer-wrapper.overlay-mode .compare-canvas-stage canvas {
position: static;
top: auto;
left: auto;
width: auto;
height: auto;
}
.compare-viewer-wrapper.overlay-mode
#panel-2
.compare-canvas-stage
canvas {
background: transparent;
} }
.compare-viewer-wrapper.side-by-side-mode { .compare-viewer-wrapper.side-by-side-mode {
display: flex; display: flex;
gap: 0;
padding: 0;
background: #ffffff;
}
.compare-workspace {
display: grid;
gap: 1rem; gap: 1rem;
grid-template-columns: minmax(0, 1fr) 20rem;
align-items: stretch;
} }
.compare-viewer-wrapper.side-by-side-mode #panel-1, .compare-viewer-wrapper.side-by-side-mode #panel-1,
.compare-viewer-wrapper.side-by-side-mode #panel-2 { .compare-viewer-wrapper.side-by-side-mode #panel-2 {
flex: 1; flex: 1;
overflow: auto; overflow: auto;
max-height: 70vh; min-height: 0;
height: 100%;
border: none;
border-radius: 0;
box-shadow: none;
}
.compare-viewer-wrapper.side-by-side-mode #panel-1 {
border-right: 2px solid #cbd5e1;
}
.compare-panel {
position: relative;
min-width: 0;
min-height: 0;
background: #ffffff;
}
.compare-panel-label {
position: sticky;
top: 0;
z-index: 5;
padding: 0.5rem 1rem;
font-size: 0.75rem;
font-weight: 600;
text-transform: uppercase;
letter-spacing: 0.05em;
color: #64748b;
background: rgba(255, 255, 255, 0.95);
border-bottom: 1px solid #e2e8f0;
backdrop-filter: blur(4px);
}
.compare-canvas-stage {
--compare-stage-pad-top: 1.5rem;
--compare-stage-pad-x: 1.75rem;
--compare-stage-pad-bottom: 1.75rem;
position: relative;
width: max-content;
margin: 0 auto;
padding: var(--compare-stage-pad-top) var(--compare-stage-pad-x)
var(--compare-stage-pad-bottom);
}
.compare-canvas-stage canvas {
display: block;
background: #ffffff;
}
.compare-highlight-layer {
position: absolute;
inset: var(--compare-stage-pad-top) var(--compare-stage-pad-x)
var(--compare-stage-pad-bottom);
pointer-events: none;
}
.compare-highlight {
position: absolute;
border-radius: 2px;
border: none;
}
.compare-highlight.added {
background: rgba(34, 197, 94, 0.28);
}
.compare-highlight.removed,
.compare-highlight.page-removed {
background: rgba(239, 68, 68, 0.28);
}
.compare-highlight.modified {
background: rgba(245, 158, 11, 0.28);
}
.compare-highlight.active {
outline: 2px solid rgba(99, 102, 241, 0.7);
outline-offset: 1px;
}
.compare-placeholder {
position: absolute;
inset: 2rem;
display: flex;
align-items: center;
justify-content: center;
text-align: center;
padding: 1rem;
border: 1px dashed #94a3b8;
border-radius: 0.75rem;
color: #475569;
background: rgba(255, 255, 255, 0.92);
}
.compare-placeholder.hidden {
display: none;
}
.compare-change-item.active {
border-color: #818cf8;
background: rgba(79, 70, 229, 0.12);
}
.compare-sidebar {
display: flex;
flex-direction: column;
gap: 0;
background: rgba(15, 23, 42, 0.6);
border: 1px solid rgba(51, 65, 85, 0.5);
border-radius: 0.75rem;
overflow: hidden;
height: clamp(36rem, 82vh, 72rem);
}
.compare-sidebar-header {
padding: 0.75rem 1rem;
border-bottom: 1px solid rgba(51, 65, 85, 0.5);
}
.compare-sidebar-filters {
display: flex;
align-items: center;
gap: 0.375rem;
padding: 0.625rem 1rem;
border-bottom: 1px solid rgba(51, 65, 85, 0.4);
flex-wrap: wrap;
}
.compare-pill {
display: inline-flex;
align-items: center;
gap: 0.25rem;
border-radius: 9999px;
padding: 0.25rem 0.625rem;
font-size: 0.6875rem;
font-weight: 600;
border: 1px solid transparent;
cursor: pointer;
transition: all 0.15s;
white-space: nowrap;
}
.compare-pill.removed {
color: #fca5a5;
background: rgba(239, 68, 68, 0.1);
border-color: rgba(239, 68, 68, 0.15);
}
.compare-pill.removed.active {
background: rgba(239, 68, 68, 0.25);
border-color: rgba(239, 68, 68, 0.5);
}
.compare-pill.modified {
color: #fcd34d;
background: rgba(245, 158, 11, 0.1);
border-color: rgba(245, 158, 11, 0.15);
}
.compare-pill.modified.active {
background: rgba(245, 158, 11, 0.25);
border-color: rgba(245, 158, 11, 0.5);
}
.compare-pill.added {
color: #86efac;
background: rgba(34, 197, 94, 0.1);
border-color: rgba(34, 197, 94, 0.15);
}
.compare-pill.added.active {
background: rgba(34, 197, 94, 0.25);
border-color: rgba(34, 197, 94, 0.5);
}
.compare-change-list {
flex: 1;
min-height: 0;
overflow-y: auto;
display: flex;
flex-direction: column;
gap: 1rem;
padding: 1rem;
}
.compare-change-item {
display: flex;
align-items: flex-start;
gap: 0.625rem;
padding: 0.75rem 1rem;
cursor: pointer;
transition: background 0.1s;
border: 1px solid rgba(51, 65, 85, 0.3);
border-left: 2px solid transparent;
border-radius: 0.5rem;
font-size: 0.8125rem;
color: #cbd5e1;
line-height: 1.4;
}
.compare-change-item:hover {
background: rgba(99, 102, 241, 0.08);
}
.compare-change-item.active {
background: rgba(99, 102, 241, 0.15);
border-left: 2px solid #818cf8;
}
.compare-change-dot {
width: 0.5rem;
height: 0.5rem;
border-radius: 50%;
flex-shrink: 0;
}
.compare-change-item .compare-change-dot {
margin-top: 0.35rem;
}
.compare-change-dot.added {
background: #22c55e;
}
.compare-change-dot.removed {
background: #ef4444;
}
.compare-change-dot.modified {
background: #f59e0b;
}
.compare-change-dot.page-added {
background: #22c55e;
}
.compare-change-dot.page-removed {
background: #ef4444;
}
.compare-change-desc {
flex: 1;
min-width: 0;
}
.compare-change-desc-text {
white-space: normal;
overflow-wrap: anywhere;
}
.compare-change-type {
font-size: 0.625rem;
font-weight: 600;
text-transform: uppercase;
letter-spacing: 0.04em;
flex-shrink: 0;
margin-top: 0.2rem;
}
.compare-change-type.added,
.compare-change-type.page-added {
color: #86efac;
}
.compare-change-type.removed,
.compare-change-type.page-removed {
color: #fca5a5;
}
.compare-change-type.modified {
color: #fcd34d;
}
.compare-change-empty {
padding: 2rem 1rem;
font-size: 0.8125rem;
color: #64748b;
text-align: center;
}
@media (max-width: 1023px) {
.compare-workspace {
grid-template-columns: minmax(0, 1fr);
}
.compare-sidebar {
height: auto;
max-height: 20rem;
}
.compare-viewer-wrapper.side-by-side-mode {
gap: 1rem;
padding: 1rem;
}
.compare-canvas-stage {
--compare-stage-pad-top: 1rem;
--compare-stage-pad-x: 1rem;
--compare-stage-pad-bottom: 1rem;
}
} }
</style> </style>
@@ -134,7 +465,7 @@
> >
<div <div
id="tool-uploader" id="tool-uploader"
class="bg-gray-800 rounded-xl shadow-xl px-4 py-8 md:p-8 max-w-5xl w-full text-gray-200 border border-gray-700" class="bg-gray-800 rounded-xl shadow-xl px-4 py-8 md:p-8 max-w-[96rem] w-full text-gray-200 border border-gray-700"
> >
<button <button
id="back-to-tools" id="back-to-tools"
@@ -200,69 +531,76 @@
<!-- Compare Viewer (hidden until both files loaded) --> <!-- Compare Viewer (hidden until both files loaded) -->
<div id="compare-viewer" class="hidden"> <div id="compare-viewer" class="hidden">
<!-- Unified Toolbar --> <!-- Toolbar -->
<div <div
class="flex flex-wrap items-center justify-center gap-4 mb-4 p-3 bg-gray-900 rounded-lg border border-gray-700" class="flex flex-wrap items-center gap-3 mb-3 p-2 bg-gray-900 rounded-lg border border-gray-700"
> >
<!-- Page Navigation -->
<button <button
id="prev-page-compare" id="prev-page-compare"
class="btn p-2 rounded-full bg-gray-700 hover:bg-gray-600 disabled:opacity-50" class="btn p-1.5 rounded bg-gray-700 hover:bg-gray-600 disabled:opacity-50"
disabled disabled
title="Previous page"
> >
<i data-lucide="chevron-left"></i> <i data-lucide="chevron-left" class="w-4 h-4"></i>
</button> </button>
<span class="text-white font-medium"> <span class="text-sm text-white font-medium">
Page <span id="current-page-display-compare">1</span> of Page <span id="current-page-display-compare">1</span> /
<span id="total-pages-display-compare">1</span> <span id="total-pages-display-compare">1</span>
</span> </span>
<button <button
id="next-page-compare" id="next-page-compare"
class="btn p-2 rounded-full bg-gray-700 hover:bg-gray-600 disabled:opacity-50" class="btn p-1.5 rounded bg-gray-700 hover:bg-gray-600 disabled:opacity-50"
disabled disabled
title="Next page"
> >
<i data-lucide="chevron-right"></i> <i data-lucide="chevron-right" class="w-4 h-4"></i>
</button> </button>
<!-- Divider --> <div class="border-l border-gray-700 h-5 mx-1"></div>
<div
class="border-l border-gray-600 h-6 mx-2 hidden sm:block"
></div>
<!-- View Mode Buttons --> <div class="bg-gray-700 p-0.5 rounded flex gap-0.5">
<div class="bg-gray-700 p-1 rounded-md flex gap-1">
<button <button
id="view-mode-overlay" id="view-mode-overlay"
class="btn bg-indigo-600 px-3 py-1 rounded text-sm font-semibold" class="btn bg-indigo-600 px-2.5 py-1 rounded text-xs font-semibold"
> >
Overlay Overlay
</button> </button>
<button <button
id="view-mode-side" id="view-mode-side"
class="btn px-3 py-1 rounded text-sm font-semibold" class="btn px-2.5 py-1 rounded text-xs font-semibold"
> >
Side-by-Side Side-by-Side
</button> </button>
</div> </div>
<!-- Divider --> <div class="border-l border-gray-700 h-5 mx-1"></div>
<div
class="border-l border-gray-600 h-6 mx-2 hidden sm:block"
></div>
<!-- Overlay Controls --> <button
<div id="overlay-controls" class="flex items-center gap-2"> id="prev-change-btn"
class="btn bg-gray-700 hover:bg-gray-600 px-2.5 py-1 rounded text-xs font-semibold disabled:opacity-50"
disabled
title="Previous change"
>
<i data-lucide="chevron-up" class="w-3.5 h-3.5"></i>
</button>
<button
id="next-change-btn"
class="btn bg-gray-700 hover:bg-gray-600 px-2.5 py-1 rounded text-xs font-semibold disabled:opacity-50"
disabled
title="Next change"
>
<i data-lucide="chevron-down" class="w-3.5 h-3.5"></i>
</button>
<div class="flex-1"></div>
<div id="overlay-controls" class="hidden flex items-center gap-2">
<button <button
id="flicker-btn" id="flicker-btn"
class="btn bg-gray-700 hover:bg-gray-600 px-3 py-1 rounded-md text-sm font-semibold" class="btn bg-gray-700 hover:bg-gray-600 px-2.5 py-1 rounded text-xs font-semibold"
> >
Flicker Flicker
</button> </button>
<label
for="opacity-slider"
class="text-sm font-medium text-gray-300"
>Opacity:</label
>
<input <input
type="range" type="range"
id="opacity-slider" id="opacity-slider"
@@ -270,46 +608,129 @@
max="1" max="1"
step="0.05" step="0.05"
value="0.5" value="0.5"
class="w-24 accent-indigo-500" class="w-20 accent-indigo-500"
/> />
</div> </div>
<!-- Side-by-side Controls (hidden initially) --> <div id="side-by-side-controls" class="flex items-center gap-2">
<div
id="side-by-side-controls"
class="hidden flex items-center gap-2"
>
<label <label
class="flex items-center gap-2 text-sm font-medium text-gray-300 cursor-pointer" class="flex items-center gap-1.5 text-xs text-gray-300 cursor-pointer"
> >
<input <input
type="checkbox" type="checkbox"
id="sync-scroll-toggle" id="sync-scroll-toggle"
checked checked
class="w-4 h-4 rounded text-indigo-600 bg-gray-700 border-gray-600 focus:ring-indigo-500" class="w-3.5 h-3.5 rounded text-indigo-600 bg-gray-700 border-gray-600"
/> />
Sync Scrolling Sync scroll
</label> </label>
</div> </div>
<button
id="export-report-btn"
class="btn bg-gray-700 hover:bg-gray-600 p-1.5 rounded disabled:opacity-50"
disabled
title="Export report"
>
<i data-lucide="download" class="w-4 h-4"></i>
</button>
</div> </div>
<!-- Viewer Wrapper --> <div class="compare-workspace">
<div <div
id="compare-viewer-wrapper" id="compare-viewer-wrapper"
class="compare-viewer-wrapper overlay-mode bg-gray-900 rounded-lg border border-gray-700 min-h-[400px] relative" class="compare-viewer-wrapper side-by-side-mode border border-slate-200 relative"
> >
<div id="panel-1" class="overflow-auto"> <div id="panel-1" class="compare-panel overflow-auto">
<canvas id="canvas-compare-1" class="block mx-auto"></canvas> <div class="compare-panel-label" id="compare-panel-label-1">
Original
</div> </div>
<div id="panel-2" class="overflow-auto"> <div class="compare-canvas-stage">
<canvas id="canvas-compare-1" class="block mx-auto"></canvas>
<div id="highlights-1" class="compare-highlight-layer"></div>
<div
id="placeholder-1"
class="compare-placeholder hidden"
></div>
</div>
</div>
<div id="panel-2" class="compare-panel overflow-auto">
<div class="compare-panel-label" id="compare-panel-label-2">
Modified
</div>
<div class="compare-canvas-stage">
<canvas <canvas
id="canvas-compare-2" id="canvas-compare-2"
class="block mx-auto" class="block mx-auto"
style="opacity: 0.5" style="opacity: 1"
></canvas> ></canvas>
<div id="highlights-2" class="compare-highlight-layer"></div>
<div
id="placeholder-2"
class="compare-placeholder hidden"
></div>
</div> </div>
</div> </div>
</div> </div>
<aside class="compare-sidebar">
<div class="compare-sidebar-header">
<div class="relative">
<span
class="absolute inset-y-0 left-0 flex items-center pl-3"
>
<i data-lucide="search" class="w-4 h-4 text-gray-400"></i>
</span>
<input
type="text"
id="compare-search-input"
placeholder="Search changes..."
class="w-full pl-9 pr-3 py-2 bg-gray-700 text-white text-sm border border-gray-600 rounded-lg focus:ring-indigo-500 focus:border-indigo-500"
/>
</div>
</div>
<div class="compare-sidebar-filters">
<button id="filter-removed" class="compare-pill removed">
<span class="compare-change-dot removed"></span>
<span id="summary-removed-count">0</span> Deleted
</button>
<button id="filter-added" class="compare-pill added">
<span class="compare-change-dot added"></span>
<span id="summary-added-count">0</span> Added
</button>
<button id="filter-modified" class="compare-pill modified">
<span class="compare-change-dot modified"></span>
<span id="summary-modified-count">0</span> Modified
</button>
<label
class="compare-pill"
style="
color: #94a3b8;
background: rgba(51, 65, 85, 0.3);
border-color: rgba(51, 65, 85, 0.4);
cursor: pointer;
"
>
<input
id="ocr-toggle"
type="checkbox"
checked
class="w-3 h-3 rounded text-indigo-600 bg-gray-700 border-gray-600"
/>
OCR
</label>
</div>
<div class="compare-change-list">
<div id="change-list-empty" class="compare-change-empty">
Upload two PDFs to see differences.
</div>
<div id="compare-change-list" class="hidden"></div>
</div>
</aside>
</div>
</div>
</div> </div>
</div> </div>

View File

@@ -0,0 +1,313 @@
import { describe, expect, it } from 'vitest';
import { comparePageModels } from '@/js/compare/engine/compare-page-models.ts';
import { diffTextRuns } from '@/js/compare/engine/diff-text-runs.ts';
import {
mergeIntoLines,
sortCompareTextItems,
} from '@/js/compare/engine/extract-page-model.ts';
import type { ComparePageModel, CompareTextItem } from '@/js/compare/types.ts';
/**
 * Test helper: builds a text item whose normalized text equals its raw text
 * and whose rectangle is a fixed 10x10 box at the origin.
 */
function makeItem(id: string, text: string): CompareTextItem {
  const rect = { x: 0, y: 0, width: 10, height: 10 };
  return { id, text, normalizedText: text, rect };
}
/**
 * Test helper: wraps text items in a 100x100 page model sourced from PDF.js.
 * `plainText` is the items' normalized text joined by single spaces.
 */
function makePage(
  pageNumber: number,
  textItems: CompareTextItem[]
): ComparePageModel {
  const normalizedTokens: string[] = [];
  textItems.forEach((item) => {
    normalizedTokens.push(item.normalizedText);
  });

  const page: ComparePageModel = {
    pageNumber,
    width: 100,
    height: 100,
    textItems,
    plainText: normalizedTokens.join(' '),
    hasText: textItems.length > 0,
    source: 'pdfjs',
  };
  return page;
}
// Token-level diffing of two text-item sequences.
describe('diffTextRuns', () => {
// A replaced token must surface as one 'modified' change, not a remove + add pair.
it('detects modified tokens as one change', () => {
const result = diffTextRuns(
[makeItem('a', 'Hello'), makeItem('b', 'world')],
[makeItem('a', 'Hello'), makeItem('c', 'there')]
);
expect(result.summary).toEqual({ added: 0, removed: 0, modified: 1 });
expect(result.changes).toHaveLength(1);
expect(result.changes[0].type).toBe('modified');
expect(result.changes[0].beforeText).toBe('world');
expect(result.changes[0].afterText).toBe('there');
});
// A token appended on the right side is reported as 'added'.
it('detects added tokens', () => {
const result = diffTextRuns(
[makeItem('a', 'Hello')],
[makeItem('a', 'Hello'), makeItem('b', 'again')]
);
expect(result.summary).toEqual({ added: 1, removed: 0, modified: 0 });
expect(result.changes[0].type).toBe('added');
});
// An inserted heading plus an edited tail must be reported as two separate
// changes (one 'added', one 'modified') rather than one large replacement.
it('splits compound replacements into discrete changes', () => {
const result = diffTextRuns(
[
makeItem('a', 'This'),
makeItem('b', 'is'),
makeItem('c', 'an'),
makeItem('d', 'example'),
makeItem('e', 'of'),
makeItem('f', 'a'),
makeItem('g', 'data'),
makeItem('h', 'table'),
makeItem('i', 'new.'),
makeItem('j', 'Disabilit'),
],
[
makeItem('k', 'Example'),
makeItem('l', 'table'),
makeItem('m', 'This'),
makeItem('n', 'is'),
makeItem('o', 'an'),
makeItem('p', 'example'),
makeItem('q', 'of'),
makeItem('r', 'a'),
makeItem('s', 'data'),
makeItem('t', 'table.'),
makeItem('u', 'Disability'),
]
);
expect(result.changes).toHaveLength(2);
expect(result.summary).toEqual({ added: 1, removed: 0, modified: 1 });
expect(
result.changes.some(
(change) =>
change.type === 'added' && change.afterText === 'Example table'
)
).toBe(true);
expect(
result.changes.some(
(change) =>
change.type === 'modified' &&
change.beforeText === 'table new. Disabilit' &&
change.afterText === 'table. Disability'
)
).toBe(true);
});
});
// Page-level comparison, including pages that exist on only one side.
describe('comparePageModels', () => {
// Passing null for the right page represents a page absent from the second
// document; it should count as one removal of type 'page-removed'.
it('marks pages missing from the second document', () => {
const result = comparePageModels(
makePage(3, [makeItem('a', 'Only')]),
null
);
expect(result.status).toBe('left-only');
expect(result.summary.removed).toBe(1);
expect(result.changes[0].type).toBe('page-removed');
});
});
// Ordering of text items by their rectangles.
describe('sortCompareTextItems', () => {
// Input is deliberately shuffled; the expected order is smaller y first and,
// for items sharing the same y, smaller x first.
it('orders tokens by reading order', () => {
const items: CompareTextItem[] = [
{
...makeItem('b', 'Body'),
rect: { x: 60, y: 40, width: 10, height: 10 },
},
{
...makeItem('a', 'Title'),
rect: { x: 10, y: 10, width: 10, height: 10 },
},
{
...makeItem('c', 'Next'),
rect: { x: 10, y: 40, width: 10, height: 10 },
},
];
expect(
sortCompareTextItems(items).map((item) => item.normalizedText)
).toEqual(['Title', 'Next', 'Body']);
});
});
/**
 * Exercises `mergeIntoLines` on sorted items, plus `diffTextRuns` on the
 * merged output and on plain `makeItem` fixtures.
 */
describe('mergeIntoLines', () => {
  /** Builds a text item whose normalized text is identical to its raw text. */
  const textRun = (
    id: string,
    text: string,
    x: number,
    y: number,
    width: number,
    height: number
  ): CompareTextItem => ({
    id,
    text,
    normalizedText: text,
    rect: { x, y, width, height },
  });

  /** Sorts the items first, then merges them, as every merge test here does. */
  const toLines = (items: CompareTextItem[]) =>
    mergeIntoLines(sortCompareTextItems(items));

  it('merges items on the same Y-line into one item', () => {
    const lines = toLines([
      textRun('0', 'Hello', 0, 10, 50, 12),
      textRun('1', 'World', 60, 10, 50, 12),
    ]);

    expect(lines).toHaveLength(1);
    expect(lines[0].normalizedText).toBe('Hello World');
    expect(lines[0].rect.x).toBe(0);
    expect(lines[0].rect.width).toBe(110);
  });

  it('does not insert spaces inside a split word', () => {
    // The second run starts 0.4 units after the first one ends (24 -> 24.4).
    const lines = toLines([
      textRun('0', 'sam', 0, 10, 24, 12),
      textRun('1', 'e', 24.4, 10, 8, 12),
    ]);

    expect(lines).toHaveLength(1);
    expect(lines[0].normalizedText).toBe('same');
  });

  it('keeps items on different Y-lines separate', () => {
    const lines = toLines([
      textRun('0', 'Line 1', 0, 10, 50, 12),
      textRun('1', 'Line 2', 0, 30, 50, 12),
    ]);

    expect(lines).toHaveLength(2);
    expect(lines[0].normalizedText).toBe('Line 1');
    expect(lines[1].normalizedText).toBe('Line 2');
  });

  it('produces same result for different text run boundaries', () => {
    // Same visible text, delivered as one run on one side and two on the other.
    const singleRun = [textRun('0', 'Hello World', 0, 10, 100, 12)];
    const splitRuns = [
      textRun('0', 'Hello', 0, 10, 45, 12),
      textRun('1', 'World', 55, 10, 45, 12),
    ];

    const beforeLines = toLines(singleRun);
    const afterLines = toLines(splitRuns);
    expect(beforeLines[0].normalizedText).toBe(afterLines[0].normalizedText);

    const { changes } = diffTextRuns(beforeLines, afterLines);
    expect(changes).toHaveLength(0);
  });

  it('detects actual changes after merging', () => {
    const beforeItems = [
      textRun('0', 'Sample', 0, 10, 60, 14),
      textRun('1', 'page text here', 0, 30, 120, 14),
    ];
    const afterItems = [
      textRun('0', 'Sample', 0, 10, 45, 14),
      textRun('1', 'PDF', 55, 10, 30, 14),
      textRun('2', 'pages text here', 0, 30, 125, 14),
    ];

    const beforeLines = toLines(beforeItems);
    const afterLines = toLines(afterItems);
    expect(beforeLines).toHaveLength(2);
    expect(afterLines).toHaveLength(2);

    const { summary, changes } = diffTextRuns(beforeLines, afterLines);
    expect(summary.modified).toBe(1);
    expect(summary.added).toBe(0);
    expect(summary.removed).toBe(0);
    expect(changes).toHaveLength(1);
    expect(changes[0].beforeText).toBe('page');
    expect(changes[0].afterText).toBe('PDF pages');
  });

  it('preserves original casing in change descriptions', () => {
    const before = [makeItem('a', 'Sample')];
    const after = [makeItem('b', 'Sample PDF')];

    const { changes } = diffTextRuns(before, after);
    expect(changes[0].afterText).toBe('PDF');
  });

  it('ignores joined versus split words when collapsed text matches', () => {
    const splitWord = [makeItem('a', 'non'), makeItem('b', 'tincidunt')];
    const joinedWord = [makeItem('c', 'nontincidunt')];

    const { changes, summary } = diffTextRuns(splitWord, joinedWord);
    expect(changes).toHaveLength(0);
    expect(summary).toEqual({ added: 0, removed: 0, modified: 0 });
  });
});

View File

@@ -0,0 +1,42 @@
import { describe, expect, it } from 'vitest';
import { pairPages } from '@/js/compare/engine/pair-pages.ts';
import type { ComparePageSignature } from '@/js/compare/types.ts';
/**
 * Builds a page signature fixture from a page number and a text string.
 *
 * The text is split on runs of whitespace and empty fragments are dropped;
 * each remaining word becomes a token item with a zero-sized rect and an id
 * of the form `<pageNumber>-<index>`.
 *
 * @param pageNumber - Page number stored on the signature and used in token ids.
 * @param text - Text stored as `plainText` and tokenized into `tokenItems`.
 * @returns The signature object; `hasText` is true whenever `text` is non-empty.
 */
function signature(pageNumber: number, text: string): ComparePageSignature {
  // Leading or trailing whitespace yields empty fragments, which are discarded.
  const words = text.split(/\s+/).filter((word) => word.length > 0);

  const tokenItems = words.map((word, position) => {
    const id = `${pageNumber}-${position}`;
    return {
      id,
      text: word,
      normalizedText: word,
      rect: { x: 0, y: 0, width: 0, height: 0 },
    };
  });

  const hasText = text.length > 0;
  return { pageNumber, plainText: text, hasText, tokenItems };
}
/** Checks how `pairPages` aligns two lists of page signatures. */
describe('pairPages', () => {
  it('pairs reordered and inserted pages without collapsing alignment', () => {
    const leftPages = [
      signature(1, 'alpha beta'),
      signature(2, 'gamma delta'),
    ];
    // Same two texts as the left side, preceded by one extra page.
    const rightPages = [
      signature(1, 'intro page'),
      signature(2, 'alpha beta'),
      signature(3, 'gamma delta'),
    ];

    const pairs = pairPages(leftPages, rightPages);
    expect(pairs).toHaveLength(3);

    const [insertedPair, firstMatch, secondMatch] = pairs;
    expect(insertedPair).toMatchObject({
      leftPageNumber: null,
      rightPageNumber: 1,
    });
    expect(firstMatch).toMatchObject({
      leftPageNumber: 1,
      rightPageNumber: 2,
    });
    expect(secondMatch).toMatchObject({
      leftPageNumber: 2,
      rightPageNumber: 3,
    });
  });
});

View File

@@ -0,0 +1,29 @@
import { describe, expect, it } from 'vitest';
import {
isLowQualityExtractedText,
joinNormalizedText,
normalizeCompareText,
} from '@/js/compare/engine/text-normalization.ts';
/** Covers the joining, normalizing and quality-check text helpers. */
describe('text normalization', () => {
  it('joins punctuation without inserting stray spaces', () => {
    const colonJoined = joinNormalizedText(['Example', 'table', ':', 'v2']);
    expect(colonJoined).toBe('Example table: v2');

    const quoteJoined = joinNormalizedText(['"', 'Quoted', 'text', '"']);
    expect(quoteJoined).toBe('"Quoted text"');
  });

  it('normalizes private-use and control characters away', () => {
    // U+0000 and U+E000 sit between the letters in the input.
    const raw = 'A\u0000B\uE000C';
    expect(normalizeCompareText(raw)).toBe('A B C');
  });

  it('flags punctuation-heavy extraction as low quality', () => {
    const punctuationOnly = '! " # $ % & \'';
    const ordinaryProse = 'Example table 2026 revision';

    expect(isLowQualityExtractedText(punctuationOnly)).toBe(true);
    expect(isLowQualityExtractedText(ordinaryProse)).toBe(false);
  });
});