diff --git a/package-lock.json b/package-lock.json index 53e966e..a70643d 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "bento-pdf", - "version": "2.4.0", + "version": "2.4.1", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "bento-pdf", - "version": "2.4.0", + "version": "2.4.1", "license": "AGPL-3.0-only", "dependencies": { "@fontsource/cedarville-cursive": "^5.2.7", @@ -30,6 +30,7 @@ "blob-stream": "^0.1.3", "bwip-js": "^4.8.0", "cropperjs": "^1.6.2", + "diff": "^8.0.3", "embedpdf-snippet": "file:vendor/embedpdf/embedpdf-snippet-2.3.0.tgz", "heic2any": "^0.0.4", "highlight.js": "^11.11.1", @@ -55,11 +56,13 @@ "markdown-it-task-lists": "^2.1.1", "markdown-it-toc-done-right": "^4.2.0", "mermaid": "^11.12.3", + "microdiff": "^1.5.0", "node-forge": "^1.3.3", "papaparse": "^5.5.3", "pdf-lib": "^1.17.1", "pdfjs-dist": "^5.4.624", "pdfkit": "^0.17.2", + "pixelmatch": "^7.1.0", "postal-mime": "^2.7.3", "rete": "^2.0.6", "rete-area-plugin": "^2.1.5", @@ -6353,6 +6356,15 @@ "integrity": "sha512-ED3jP8saaweFTjeGX8HQPjeC1YYyZs98jGNZx6IiBvxW7JG5v492kamAQB3m2wop07CvU/RQmzcKr6bgcC5D/Q==", "license": "MIT" }, + "node_modules/diff": { + "version": "8.0.3", + "resolved": "https://registry.npmjs.org/diff/-/diff-8.0.3.tgz", + "integrity": "sha512-qejHi7bcSD4hQAZE0tNAawRK1ZtafHDmMTMkrrIGgSLl7hTnQHmKCeB45xAcbfTqK2zowkM3j3bHt/4b/ARbYQ==", + "license": "BSD-3-Clause", + "engines": { + "node": ">=0.3.1" + } + }, "node_modules/diffie-hellman": { "version": "5.0.3", "resolved": "https://registry.npmjs.org/diffie-hellman/-/diffie-hellman-5.0.3.tgz", @@ -9068,6 +9080,12 @@ "uuid": "^11.1.0" } }, + "node_modules/microdiff": { + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/microdiff/-/microdiff-1.5.0.tgz", + "integrity": "sha512-Drq+/THMvDdzRYrK0oxJmOKiC24ayUV8ahrt8l3oRK51PWt6gdtrIGrlIH3pT/lFh1z93FbAcidtsHcWbnRz8Q==", + "license": "MIT" + }, "node_modules/micromark-util-character": { "version": "2.1.1", "resolved": 
"https://registry.npmjs.org/micromark-util-character/-/micromark-util-character-2.1.1.tgz", @@ -9896,6 +9914,18 @@ "url": "https://github.com/sponsors/jonschlinkert" } }, + "node_modules/pixelmatch": { + "version": "7.1.0", + "resolved": "https://registry.npmjs.org/pixelmatch/-/pixelmatch-7.1.0.tgz", + "integrity": "sha512-1wrVzJ2STrpmONHKBy228LM1b84msXDUoAzVEl0R8Mz4Ce6EPr+IVtxm8+yvrqLYMHswREkjYFaMxnyGnaY3Ng==", + "license": "ISC", + "dependencies": { + "pngjs": "^7.0.0" + }, + "bin": { + "pixelmatch": "bin/pixelmatch" + } + }, "node_modules/pkg-dir": { "version": "5.0.0", "resolved": "https://registry.npmjs.org/pkg-dir/-/pkg-dir-5.0.0.tgz", @@ -9925,6 +9955,15 @@ "resolved": "https://registry.npmjs.org/png-js/-/png-js-1.0.0.tgz", "integrity": "sha512-k+YsbhpA9e+EFfKjTCH3VW6aoKlyNYI6NYdTfDL4CIvFnvsuO84ttonmZE7rc+v23SLTH8XX+5w/Ak9v0xGY4g==" }, + "node_modules/pngjs": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/pngjs/-/pngjs-7.0.0.tgz", + "integrity": "sha512-LKWqWJRhstyYo9pGvgor/ivk2w94eSjE3RGVuzLGlr3NmD8bf7RcYGze1mNdEHRP6TRP6rMuDHk5t44hnTRyow==", + "license": "MIT", + "engines": { + "node": ">=14.19.0" + } + }, "node_modules/points-on-curve": { "version": "0.2.0", "resolved": "https://registry.npmjs.org/points-on-curve/-/points-on-curve-0.2.0.tgz", diff --git a/package.json b/package.json index 1e2f653..81929d7 100644 --- a/package.json +++ b/package.json @@ -86,6 +86,7 @@ "blob-stream": "^0.1.3", "bwip-js": "^4.8.0", "cropperjs": "^1.6.2", + "diff": "^8.0.3", "embedpdf-snippet": "file:vendor/embedpdf/embedpdf-snippet-2.3.0.tgz", "heic2any": "^0.0.4", "highlight.js": "^11.11.1", @@ -111,11 +112,13 @@ "markdown-it-task-lists": "^2.1.1", "markdown-it-toc-done-right": "^4.2.0", "mermaid": "^11.12.3", + "microdiff": "^1.5.0", "node-forge": "^1.3.3", "papaparse": "^5.5.3", "pdf-lib": "^1.17.1", "pdfjs-dist": "^5.4.624", "pdfkit": "^0.17.2", + "pixelmatch": "^7.1.0", "postal-mime": "^2.7.3", "rete": "^2.0.6", "rete-area-plugin": "^2.1.5", diff --git 
/**
 * Compares one aligned pair of page models and reports textual differences.
 *
 * - Neither page present: an empty `match` result with both page numbers `null`.
 * - Only `rightPage` present: status `right-only` with one `page-added` change.
 * - Only `leftPage` present: status `left-only` with one `page-removed` change.
 * - Both present: the word-level diff from `diffTextRuns`; status is `changed`
 *   when at least one change was produced, otherwise `match`.
 *
 * `visualDiff` is always `null` in the result built here. `usedOcr` is true
 * when any page that is present has `source === 'ocr'`.
 *
 * @param leftPage Page model from the first PDF, or `null` when unpaired.
 * @param rightPage Page model from the second PDF, or `null` when unpaired.
 * @returns The comparison result for this page pair.
 */
export function comparePageModels(
  leftPage: ComparePageModel | null,
  rightPage: ComparePageModel | null
): ComparePageResult {
  if (!leftPage) {
    if (!rightPage) {
      // Nothing on either side: report an empty match.
      return {
        status: 'match',
        leftPageNumber: null,
        rightPageNumber: null,
        changes: [],
        summary: { added: 0, removed: 0, modified: 0 },
        visualDiff: null,
        usedOcr: false,
      };
    }

    const { pageNumber, plainText, source } = rightPage;
    return {
      status: 'right-only',
      leftPageNumber: null,
      rightPageNumber: pageNumber,
      changes: [
        {
          id: 'page-added',
          type: 'page-added',
          description: `Page ${pageNumber} exists only in the second PDF.`,
          beforeText: '',
          // Only the first 200 characters are kept as a preview.
          afterText: plainText.slice(0, 200),
          beforeRects: [],
          afterRects: [],
        },
      ],
      summary: { added: 1, removed: 0, modified: 0 },
      visualDiff: null,
      usedOcr: source === 'ocr',
    };
  }

  if (!rightPage) {
    const { pageNumber, plainText, source } = leftPage;
    return {
      status: 'left-only',
      leftPageNumber: pageNumber,
      rightPageNumber: null,
      changes: [
        {
          id: 'page-removed',
          type: 'page-removed',
          description: `Page ${pageNumber} exists only in the first PDF.`,
          // Only the first 200 characters are kept as a preview.
          beforeText: plainText.slice(0, 200),
          afterText: '',
          beforeRects: [],
          afterRects: [],
        },
      ],
      summary: { added: 0, removed: 1, modified: 0 },
      visualDiff: null,
      usedOcr: source === 'ocr',
    };
  }

  const textDiff = diffTextRuns(leftPage.textItems, rightPage.textItems);
  const hasChanges = textDiff.changes.length > 0;
  const eitherSideFromOcr =
    leftPage.source === 'ocr' || rightPage.source === 'ocr';

  return {
    status: hasChanges ? 'changed' : 'match',
    leftPageNumber: leftPage.pageNumber,
    rightPageNumber: rightPage.pageNumber,
    changes: textDiff.changes,
    summary: textDiff.summary,
    visualDiff: null,
    usedOcr: eitherSideFromOcr,
  };
}
/**
 * Merges rectangles that lie next to each other on the same text line into
 * larger bounding rectangles.
 *
 * The input is copied and ordered by `y`, then `x`. Each rectangle is compared
 * with the last rectangle of the current cluster: it joins that cluster when
 * its `y` differs by less than `max(0.6 * height, 4)` and its `x` starts no
 * further right than the previous rectangle's right edge plus twice its
 * height. Otherwise it starts a new cluster.
 *
 * @param rects Rectangles to merge; the array itself is not modified.
 * @returns One bounding rectangle per cluster, in cluster order.
 */
function groupAdjacentRects(rects: CompareRectangle[]): CompareRectangle[] {
  if (rects.length === 0) return [];

  const ordered = rects.slice().sort((a, b) => a.y - b.y || a.x - b.x);
  const clusters: CompareRectangle[][] = [];

  for (const rect of ordered) {
    const cluster = clusters[clusters.length - 1];

    if (cluster !== undefined) {
      const tail = cluster[cluster.length - 1];
      const verticalSlack = Math.max(tail.height * 0.6, 4);
      const onSameLine = Math.abs(rect.y - tail.y) < verticalSlack;
      // Allow a horizontal gap of up to two line heights after the tail.
      const withinReach = rect.x <= tail.x + tail.width + tail.height * 2;

      if (onSameLine && withinReach) {
        cluster.push(rect);
        continue;
      }
    }

    clusters.push([rect]);
  }

  return clusters.map((cluster) => {
    let minX = Infinity;
    let minY = Infinity;
    let maxX = -Infinity;
    let maxY = -Infinity;

    for (const member of cluster) {
      minX = Math.min(minX, member.x);
      minY = Math.min(minY, member.y);
      maxX = Math.max(maxX, member.x + member.width);
      maxY = Math.max(maxY, member.y + member.height);
    }

    return { x: minX, y: minY, width: maxX - minX, height: maxY - minY };
  });
}
/**
 * Counts changes by type.
 *
 * Only `added`, `removed` and `modified` changes are counted; a change of any
 * other type leaves the totals untouched.
 *
 * @param changes Changes to tally.
 * @returns The number of added, removed and modified changes.
 */
function toSummary(changes: CompareTextChange[]): CompareChangeSummary {
  const tally: CompareChangeSummary = { added: 0, removed: 0, modified: 0 };

  for (const { type } of changes) {
    switch (type) {
      case 'added':
        tally.added += 1;
        break;
      case 'removed':
        tally.removed += 1;
        break;
      case 'modified':
        tally.modified += 1;
        break;
      default:
        // Other change types are not part of the word-level summary.
        break;
    }
  }

  return tally;
}
/**
 * Font style records keyed by font name (looked up with `item.fontName`).
 * Only `fontFamily` is read by the code that consumes this type.
 *
 * NOTE(review): the type arguments were missing in the reviewed text
 * (`Record` alone does not compile); this shape is reconstructed from usage.
 * Confirm it against the pdf.js text-content styles type.
 */
type TextStyles = Record<string, { fontFamily?: string }>;

// Off-screen canvas used only to measure text. It is null when there is no
// global `document`.
const measurementCanvas =
  typeof document !== 'undefined' ? document.createElement('canvas') : null;
const measurementContext = measurementCanvas
  ? measurementCanvas.getContext('2d')
  : null;
// Memoized widths keyed by `${fontSpec}|${text}`. Only allocated when a 2D
// context is available.
// NOTE(review): nothing visible here evicts entries, so the cache grows with
// every distinct (font, text) pair measured.
const textMeasurementCache: Map<string, number> | null = measurementContext
  ? new Map()
  : null;
// Font most recently assigned to the context, so it is only re-set on change.
let lastMeasurementFont = '';

// Fallback widths (arbitrary relative units) used when real measurement is
// unavailable or yields an unusable value.
const DEFAULT_CHAR_WIDTH = 1;
const DEFAULT_SPACE_WIDTH = 0.33;

/**
 * Decides from punctuation alone whether `current` should be glued onto
 * `previous` without a space.
 *
 * @param previous Text of the preceding token; an empty string never joins.
 * @param current Text of the token being considered.
 * @returns True when `current` starts with closing punctuation or a quote
 *   character, or when `previous` ends with an opening bracket, slash, quote
 *   or hyphen.
 */
function shouldJoinTokenWithPrevious(previous: string, current: string) {
  if (!previous) return false;
  if (/^[,.;:!?%)\]}]/.test(current)) return true;
  if (/^[''"'’”]/u.test(current)) return true;
  if (/[([{/"'“‘-]$/u.test(previous)) return true;
  return false;
}

/**
 * Measures the rendered width of `text` in the given CSS font.
 *
 * With a 2D canvas context the width comes from `measureText` and is cached
 * per `(fontSpec, text)` pair. Without one the width is estimated: 0 for an
 * empty string, `DEFAULT_SPACE_WIDTH` for a single space, otherwise
 * `DEFAULT_CHAR_WIDTH` per UTF-16 code unit.
 *
 * @param fontSpec CSS font shorthand assigned to the context's `font`.
 * @param text Text to measure.
 * @returns The measured or estimated width. A falsy measurement is reported
 *   as 0.
 */
function measureTextWidth(fontSpec: string, text: string): number {
  if (!measurementContext) {
    if (!text) return 0;
    if (text === ' ') return DEFAULT_SPACE_WIDTH;
    return text.length * DEFAULT_CHAR_WIDTH;
  }

  // Assigning `font` is skipped when the font has not changed.
  if (lastMeasurementFont !== fontSpec) {
    measurementContext.font = fontSpec;
    lastMeasurementFont = fontSpec;
  }

  const key = `${fontSpec}|${text}`;
  const cached = textMeasurementCache?.get(key);
  if (cached !== undefined) {
    return cached;
  }

  const width = measurementContext.measureText(text).width || 0;
  textMeasurementCache?.set(key, width);
  return width;
}
DEFAULT_SPACE_WIDTH : DEFAULT_CHAR_WIDTH; + } + weights[index] = width; + previousAdvance = advance; + } + + if (!Number.isFinite(previousAdvance) || previousAdvance <= 0) { + for (let index = 0; index < totalLen; index += 1) { + weights[index] = + rawText[index] === ' ' ? DEFAULT_SPACE_WIDTH : DEFAULT_CHAR_WIDTH; + } + } + + const prefix: number[] = new Array(totalLen + 1); + prefix[0] = 0; + for (let index = 0; index < totalLen; index += 1) { + prefix[index + 1] = prefix[index] + weights[index]; + } + const totalWeight = prefix[totalLen] || 1; + + const rawX = item.transform[4]; + const rawY = item.transform[5]; + const transformed = [ + viewport.convertToViewportPoint(rawX, rawY), + viewport.convertToViewportPoint(rawX + item.width, rawY), + viewport.convertToViewportPoint(rawX, rawY + item.height), + viewport.convertToViewportPoint(rawX + item.width, rawY + item.height), + ]; + const xs = transformed.map(([x]) => x); + const ys = transformed.map(([, y]) => y); + const left = Math.min(...xs); + const right = Math.max(...xs); + const top = Math.min(...ys); + const bottom = Math.max(...ys); + + const [baselineStart, baselineEnd, verticalEnd] = transformed; + const baselineVector: [number, number] = [ + baselineEnd[0] - baselineStart[0], + baselineEnd[1] - baselineStart[1], + ]; + const verticalVector: [number, number] = [ + verticalEnd[0] - baselineStart[0], + verticalEnd[1] - baselineStart[1], + ]; + const hasOrientationVectors = + Math.hypot(baselineVector[0], baselineVector[1]) > 1e-6 && + Math.hypot(verticalVector[0], verticalVector[1]) > 1e-6; + + const tokens: CompareWordToken[] = []; + const wordRegex = /\S+/gu; + let match: RegExpExecArray | null; + let previousEnd = 0; + + while ((match = wordRegex.exec(rawText)) !== null) { + const tokenText = match[0]; + const normalizedWord = normalizeCompareText(tokenText); + if (!normalizedWord) { + previousEnd = match.index + tokenText.length; + continue; + } + + const startIndex = match.index; + const endIndex = 
startIndex + tokenText.length; + const relStart = prefix[startIndex] / totalWeight; + const relEnd = prefix[endIndex] / totalWeight; + + let wordLeft: number; + let wordRight: number; + let wordTop: number; + let wordBottom: number; + + if (hasOrientationVectors) { + const segStart: [number, number] = [ + baselineStart[0] + baselineVector[0] * relStart, + baselineStart[1] + baselineVector[1] * relStart, + ]; + const segEnd: [number, number] = [ + baselineStart[0] + baselineVector[0] * relEnd, + baselineStart[1] + baselineVector[1] * relEnd, + ]; + const cornerPoints: Array<[number, number]> = [ + segStart, + [segStart[0] + verticalVector[0], segStart[1] + verticalVector[1]], + [segEnd[0] + verticalVector[0], segEnd[1] + verticalVector[1]], + segEnd, + ]; + wordLeft = Math.min(...cornerPoints.map(([x]) => x)); + wordRight = Math.max(...cornerPoints.map(([x]) => x)); + wordTop = Math.min(...cornerPoints.map(([, y]) => y)); + wordBottom = Math.max(...cornerPoints.map(([, y]) => y)); + } else { + const segLeft = left + (right - left) * relStart; + const segRight = left + (right - left) * relEnd; + wordLeft = Math.min(segLeft, segRight); + wordRight = Math.max(segLeft, segRight); + wordTop = top; + wordBottom = bottom; + } + + const width = Math.max(wordRight - wordLeft, 1); + const height = Math.max(wordBottom - wordTop, fallbackRect.height); + const gapText = rawText.slice(previousEnd, startIndex); + + const previousToken = tokens[tokens.length - 1]; + + tokens.push({ + word: normalizedWord, + compareWord: normalizedWord.toLowerCase(), + rect: { + x: Number.isFinite(wordLeft) ? wordLeft : fallbackRect.x, + y: Number.isFinite(wordTop) ? wordTop : fallbackRect.y, + width, + height, + }, + joinsWithPrevious: + (gapText.length > 0 && !/\s/u.test(gapText)) || + (previousToken + ? 
/**
 * Returns a copy of `items` ordered top-to-bottom, then left-to-right.
 *
 * Two items are treated as being on the same line when their `rect.y` values
 * differ by no more than `max(0.6 * smaller height, 4)`. On the same line they
 * are ordered by `rect.x`; when the x positions differ by at most 1 the item
 * `id` (compared with `localeCompare`) breaks the tie.
 *
 * @param items Items to order; the array itself is not modified.
 * @returns A new, sorted array.
 */
export function sortCompareTextItems(items: CompareTextItem[]) {
  const byReadingOrder = (a: CompareTextItem, b: CompareTextItem): number => {
    const verticalDelta = a.rect.y - b.rect.y;
    const sameLineSlack = Math.max(
      Math.min(a.rect.height, b.rect.height) * 0.6,
      4
    );
    if (Math.abs(verticalDelta) > sameLineSlack) {
      return verticalDelta;
    }

    const horizontalDelta = a.rect.x - b.rect.x;
    return Math.abs(horizontalDelta) > 1
      ? horizontalDelta
      : a.id.localeCompare(b.id);
  };

  return items.slice().sort(byReadingOrder);
}
/**
 * Fuses two word tokens into one.
 *
 * The texts are concatenated left-then-right with no separator, and the
 * resulting rectangle is the smallest one that contains both input
 * rectangles. Only `word`, `compareWord` and `rect` are set on the result.
 *
 * @param left Token whose text comes first.
 * @param right Token whose text is appended.
 * @returns A new token covering both inputs.
 */
function mergeWordTokenRects(
  left: CompareWordToken,
  right: CompareWordToken
): CompareWordToken {
  const a = left.rect;
  const b = right.rect;

  const x = Math.min(a.x, b.x);
  const y = Math.min(a.y, b.y);
  const farX = Math.max(a.x + a.width, b.x + b.width);
  const farY = Math.max(a.y + a.height, b.y + b.height);

  return {
    word: left.word + right.word,
    compareWord: left.compareWord + right.compareWord,
    rect: { x, y, width: farX - x, height: farY - y },
  };
}
/**
 * Builds the word tokens for one merged line from the tokens of its items.
 *
 * An item without word tokens contributes a single token made from its
 * normalized text and rectangle. Tokens are glued onto the previously emitted
 * token when:
 * - they are not the first token of their item and carry `joinsWithPrevious`;
 * - or they are the first token of an item and
 *   `shouldInsertSpaceBetweenItems` reports no space between the previous item
 *   and this one.
 * The very first emitted token is never glued to anything.
 *
 * @param lineItems Items of one line, in the order they should be read.
 * @returns The merged tokens, or `undefined` when no item has any word tokens.
 */
function buildMergedWordTokens(lineItems: CompareTextItem[]) {
  const anyItemHasTokens = lineItems.some(
    (item) => item.wordTokens && item.wordTokens.length > 0
  );
  if (!anyItemHasTokens) {
    return undefined;
  }

  const output: CompareWordToken[] = [];
  let priorItem: CompareTextItem | null = null;

  for (const item of lineItems) {
    const ownTokens: CompareWordToken[] =
      item.wordTokens && item.wordTokens.length > 0
        ? item.wordTokens
        : [
            {
              word: item.normalizedText,
              compareWord: item.normalizedText.toLowerCase(),
              rect: item.rect,
            } satisfies CompareWordToken,
          ];

    for (let position = 0; position < ownTokens.length; position += 1) {
      const token = ownTokens[position];

      let glueToPrevious = false;
      if (output.length > 0) {
        if (position > 0) {
          // Inside an item the token itself says whether it continues a word.
          glueToPrevious = Boolean(token.joinsWithPrevious);
        } else if (priorItem) {
          // Across items the decision comes from the gap between the items.
          glueToPrevious = !shouldInsertSpaceBetweenItems(priorItem, item);
        }
      }

      if (glueToPrevious) {
        const lastIndex = output.length - 1;
        output[lastIndex] = mergeWordTokenRects(output[lastIndex], token);
      } else {
        output.push({
          word: token.word,
          compareWord: token.compareWord,
          rect: token.rect,
        });
      }
    }

    priorItem = item;
  }

  return output;
}
/**
 * Runs OCR on a rendered page canvas and converts the recognized words into a
 * page model.
 *
 * Each recognized word with non-empty normalized text becomes one text item
 * carrying a single word token; the items are then sorted and merged into
 * lines with `sortCompareTextItems` and `mergeIntoLines`.
 *
 * @param canvas Canvas holding the rendered page; its pixel size becomes the
 *   model's `width` and `height`.
 * @param language Language code passed to `Tesseract.recognize`.
 * @param onProgress Optional callback receiving the logger's status string and
 *   progress value (0 when the logger reports none).
 * @returns A page model with `source: 'ocr'`. `pageNumber` is always 0 here.
 *   NOTE(review): callers presumably assign the real page number — confirm.
 */
export async function recognizePageCanvas(
  canvas: HTMLCanvasElement,
  language: string,
  onProgress?: (status: string, progress: number) => void
): Promise<ComparePageModel> {
  const result = await Tesseract.recognize(canvas, language, {
    logger(message) {
      onProgress?.(message.status, message.progress || 0);
    },
  });

  // The library's result type is bypassed here to read `words`.
  // NOTE(review): confirm that the installed tesseract.js version actually
  // populates `data.words` with these recognize options. If it does not, this
  // function silently returns a page with no text.
  const ocrData = result.data as unknown as { words?: OcrWord[] };
  const recognizedWords = ocrData.words ?? [];

  // `index` is the position in the raw OCR output, so ids stay stable even
  // when empty words are dropped.
  const items = recognizedWords.flatMap((word, index): CompareTextItem[] => {
    const normalizedText = normalizeCompareText(word.text || '');
    if (!normalizedText) return [];

    // Width and height are clamped to at least 1.
    const rect = {
      x: word.bbox.x0,
      y: word.bbox.y0,
      width: Math.max(word.bbox.x1 - word.bbox.x0, 1),
      height: Math.max(word.bbox.y1 - word.bbox.y0, 1),
    };

    return [
      {
        id: `ocr-${index}-${normalizedText}`,
        text: word.text,
        normalizedText,
        rect,
        wordTokens: [
          {
            word: normalizedText,
            compareWord: normalizedText.toLowerCase(),
            // Separate object so the item and its token never share a rect.
            rect: { ...rect },
          },
        ],
      },
    ];
  });

  const mergedItems = mergeIntoLines(sortCompareTextItems(items));

  return {
    pageNumber: 0,
    width: canvas.width,
    height: canvas.height,
    textItems: mergedItems,
    plainText: joinCompareTextItems(mergedItems),
    hasText: mergedItems.length > 0,
    source: 'ocr',
  };
}
/** Splits text on whitespace into the set of its unique, non-empty tokens. */
function tokenize(text: string): Set<string> {
  return new Set(text.split(/\s+/).filter(Boolean));
}

/**
 * Scores how likely two pages correspond to each other, in the range [0, 1].
 *
 * - Neither page has text: 0.7 when the page numbers are equal, else 0.35.
 * - Exactly one page has text: 0.08.
 * - Both have text: Jaccard similarity of their token sets, plus 0.1 when the
 *   page numbers are equal, capped at 1.
 *
 * @param left Signature of the page from the first document.
 * @param right Signature of the page from the second document.
 * @param leftTokens Optional pre-computed token set of `left.plainText`.
 * @param rightTokens Optional pre-computed token set of `right.plainText`.
 * @returns The similarity score.
 */
function similarityScore(
  left: ComparePageSignature,
  right: ComparePageSignature,
  leftTokens?: ReadonlySet<string>,
  rightTokens?: ReadonlySet<string>
): number {
  const samePosition = left.pageNumber === right.pageNumber;

  if (!left.hasText && !right.hasText) {
    return samePosition ? 0.7 : 0.35;
  }

  if (!left.hasText || !right.hasText) {
    return 0.08;
  }

  const leftSet = leftTokens ?? tokenize(left.plainText);
  const rightSet = rightTokens ?? tokenize(right.plainText);

  let intersectionCount = 0;
  leftSet.forEach((token) => {
    if (rightSet.has(token)) intersectionCount += 1;
  });

  // |A ∪ B| = |A| + |B| - |A ∩ B|, so no union set has to be built.
  const unionSize = leftSet.size + rightSet.size - intersectionCount;
  const jaccard = unionSize === 0 ? 0 : intersectionCount / unionSize;
  const positionalBias = samePosition ? 0.1 : 0;
  return Math.min(jaccard + positionalBias, 1);
}

/**
 * Aligns the pages of two documents with an edit-distance style dynamic
 * program.
 *
 * Pairing two pages costs `1 - similarity`; leaving a page unpaired costs 0.8.
 * On equal cost a pairing is preferred over dropping a left page, which is
 * preferred over dropping a right page.
 *
 * @param leftPages Page signatures of the first document, in page order.
 * @param rightPages Page signatures of the second document, in page order.
 * @returns The pairs in document order with `pairIndex` starting at 1.
 *   Unpaired pages have `null` on the missing side and `confidence` 0; paired
 *   pages carry their similarity score as `confidence`.
 */
export function pairPages(
  leftPages: ComparePageSignature[],
  rightPages: ComparePageSignature[]
): ComparePagePair[] {
  type Step = 'match' | 'left' | 'right';

  const insertionCost = 0.8;
  const rowCount = leftPages.length + 1;
  const colCount = rightPages.length + 1;

  // Tokenize every page once instead of once per table cell, and keep the
  // scores so backtracking can reuse them.
  const leftTokenSets = leftPages.map((page) => tokenize(page.plainText));
  const rightTokenSets = rightPages.map((page) => tokenize(page.plainText));
  const similarity = leftPages.map((leftPage, leftIndex) =>
    rightPages.map((rightPage, rightIndex) =>
      similarityScore(
        leftPage,
        rightPage,
        leftTokenSets[leftIndex],
        rightTokenSets[rightIndex]
      )
    )
  );

  const dp = Array.from({ length: rowCount }, () =>
    Array<number>(colCount).fill(0)
  );
  const backtrack = Array.from({ length: rowCount }, () =>
    Array<Step>(colCount).fill('match')
  );

  // First column/row: everything seen so far is unpaired.
  for (let i = 1; i < rowCount; i += 1) {
    dp[i][0] = i * insertionCost;
    backtrack[i][0] = 'left';
  }

  for (let j = 1; j < colCount; j += 1) {
    dp[0][j] = j * insertionCost;
    backtrack[0][j] = 'right';
  }

  for (let i = 1; i < rowCount; i += 1) {
    for (let j = 1; j < colCount; j += 1) {
      const matchCost = dp[i - 1][j - 1] + (1 - similarity[i - 1][j - 1]);
      const leftCost = dp[i - 1][j] + insertionCost;
      const rightCost = dp[i][j - 1] + insertionCost;

      const minCost = Math.min(matchCost, leftCost, rightCost);
      dp[i][j] = minCost;

      if (minCost === matchCost) {
        backtrack[i][j] = 'match';
      } else if (minCost === leftCost) {
        backtrack[i][j] = 'left';
      } else {
        backtrack[i][j] = 'right';
      }
    }
  }

  // Walk back from the bottom-right corner; pairs are collected in reverse.
  const pairs: ComparePagePair[] = [];
  let i = leftPages.length;
  let j = rightPages.length;

  while (i > 0 || j > 0) {
    const direction = backtrack[i][j];

    if (i > 0 && j > 0 && direction === 'match') {
      pairs.push({
        pairIndex: 0,
        leftPageNumber: leftPages[i - 1].pageNumber,
        rightPageNumber: rightPages[j - 1].pageNumber,
        confidence: similarity[i - 1][j - 1],
      });
      i -= 1;
      j -= 1;
      continue;
    }

    if (i > 0 && (j === 0 || direction === 'left')) {
      pairs.push({
        pairIndex: 0,
        leftPageNumber: leftPages[i - 1].pageNumber,
        rightPageNumber: null,
        confidence: 0,
      });
      i -= 1;
      continue;
    }

    if (j > 0) {
      pairs.push({
        pairIndex: 0,
        leftPageNumber: null,
        rightPageNumber: rightPages[j - 1].pageNumber,
        confidence: 0,
      });
      j -= 1;
    }
  }

  return pairs
    .reverse()
    .map((pair, index) => ({ ...pair, pairIndex: index + 1 }));
}
normalizeCompareText(text); + if (!normalized) return true; + + const tokens = normalized.split(/\s+/).filter(Boolean); + const visibleCharacters = Array.from(normalized).filter( + (character) => character.trim().length > 0 + ); + const alphaNumericCount = visibleCharacters.filter((character) => + /[\p{L}\p{N}]/u.test(character) + ).length; + const symbolCount = visibleCharacters.length - alphaNumericCount; + const tokenWithAlphaNumericCount = tokens.filter((token) => + /[\p{L}\p{N}]/u.test(token) + ).length; + + if (alphaNumericCount === 0) return true; + if ( + visibleCharacters.length >= 12 && + alphaNumericCount / visibleCharacters.length < 0.45 && + symbolCount / visibleCharacters.length > 0.35 + ) { + return true; + } + if (tokens.length >= 6 && tokenWithAlphaNumericCount / tokens.length < 0.6) { + return true; + } + + return false; +} diff --git a/src/js/compare/engine/visual-diff.ts b/src/js/compare/engine/visual-diff.ts new file mode 100644 index 0000000..b8e891c --- /dev/null +++ b/src/js/compare/engine/visual-diff.ts @@ -0,0 +1,134 @@ +import pixelmatch from 'pixelmatch'; + +import type { CompareVisualDiff } from '../types.ts'; + +type FocusRegion = { + x: number; + y: number; + width: number; + height: number; +}; + +function createCanvas(width: number, height: number) { + const canvas = document.createElement('canvas'); + canvas.width = width; + canvas.height = height; + return canvas; +} + +function drawNormalized( + sourceCanvas: HTMLCanvasElement, + targetCanvas: HTMLCanvasElement +) { + const context = targetCanvas.getContext('2d'); + if (!context) { + throw new Error('Could not create comparison canvas context.'); + } + + context.fillStyle = '#ffffff'; + context.fillRect(0, 0, targetCanvas.width, targetCanvas.height); + + const offsetX = Math.floor((targetCanvas.width - sourceCanvas.width) / 2); + const offsetY = Math.floor((targetCanvas.height - sourceCanvas.height) / 2); + context.drawImage(sourceCanvas, offsetX, offsetY); +} + +export function 
renderVisualDiff( + canvas1: HTMLCanvasElement, + canvas2: HTMLCanvasElement, + outputCanvas: HTMLCanvasElement, + focusRegion?: FocusRegion +): CompareVisualDiff { + const width = Math.max(canvas1.width, canvas2.width, 1); + const height = Math.max(canvas1.height, canvas2.height, 1); + const normalizedCanvas1 = createCanvas(width, height); + const normalizedCanvas2 = createCanvas(width, height); + + drawNormalized(canvas1, normalizedCanvas1); + drawNormalized(canvas2, normalizedCanvas2); + + outputCanvas.width = width; + outputCanvas.height = height; + + const context1 = normalizedCanvas1.getContext('2d'); + const context2 = normalizedCanvas2.getContext('2d'); + const outputContext = outputCanvas.getContext('2d'); + + if (!context1 || !context2 || !outputContext) { + throw new Error('Could not create visual diff context.'); + } + + const image1 = context1.getImageData(0, 0, width, height); + const image2 = context2.getImageData(0, 0, width, height); + const diffImage = outputContext.createImageData(width, height); + + const mismatchPixels = pixelmatch( + image1.data, + image2.data, + diffImage.data, + width, + height, + { + threshold: 0.12, + includeAA: false, + alpha: 0.2, + diffMask: false, + diffColor: [239, 68, 68], + diffColorAlt: [34, 197, 94], + } + ); + + const overlayCanvas = createCanvas(width, height); + const overlayContext = overlayCanvas.getContext('2d'); + + if (!overlayContext) { + throw new Error('Could not create visual diff overlay context.'); + } + + overlayContext.putImageData(diffImage, 0, 0); + + const region = focusRegion + ? 
{ + x: Math.max(Math.floor(focusRegion.x), 0), + y: Math.max(Math.floor(focusRegion.y), 0), + width: Math.min(Math.ceil(focusRegion.width), width), + height: Math.min(Math.ceil(focusRegion.height), height), + } + : { x: 0, y: 0, width, height }; + + outputCanvas.width = Math.max(region.width, 1); + outputCanvas.height = Math.max(region.height, 1); + + outputContext.fillStyle = '#ffffff'; + outputContext.fillRect(0, 0, outputCanvas.width, outputCanvas.height); + outputContext.drawImage( + normalizedCanvas2, + region.x, + region.y, + region.width, + region.height, + 0, + 0, + outputCanvas.width, + outputCanvas.height + ); + outputContext.globalAlpha = 0.9; + outputContext.drawImage( + overlayCanvas, + region.x, + region.y, + region.width, + region.height, + 0, + 0, + outputCanvas.width, + outputCanvas.height + ); + outputContext.globalAlpha = 1; + + return { + mismatchPixels, + mismatchRatio: mismatchPixels / Math.max(width * height, 1), + hasDiff: mismatchPixels > 0, + }; +} diff --git a/src/js/compare/reporting/build-report.ts b/src/js/compare/reporting/build-report.ts new file mode 100644 index 0000000..ad7b9e1 --- /dev/null +++ b/src/js/compare/reporting/build-report.ts @@ -0,0 +1,77 @@ +import type { ComparePagePair, ComparePageResult } from '../types.ts'; + +function escapeHtml(text: string) { + return text + .replace(/&/g, '&') + .replace(//g, '>') + .replace(/"/g, '"') + .replace(/'/g, '''); +} + +export function buildCompareReport( + fileName1: string, + fileName2: string, + pairs: ComparePagePair[], + results: ComparePageResult[] +) { + const totals = results.reduce( + (summary, result) => { + summary.added += result.summary.added; + summary.removed += result.summary.removed; + summary.modified += result.summary.modified; + return summary; + }, + { added: 0, removed: 0, modified: 0 } + ); + + const rows = results + .map((result, index) => { + const pair = pairs[index]; + const changes = result.changes + .map( + (change) => + `
  • ${escapeHtml(change.type)}: ${escapeHtml(change.description)}
  • ` + ) + .join(''); + + return ` +
    +

    Comparison ${pair?.pairIndex || index + 1}

    +

    PDF 1 page: ${pair?.leftPageNumber ?? 'none'} | PDF 2 page: ${pair?.rightPageNumber ?? 'none'} | Confidence: ${((pair?.confidence || 0) * 100).toFixed(0)}%

    +

    Status: ${escapeHtml(result.status)}${result.usedOcr ? ' | OCR used' : ''}

    +

    Added: ${result.summary.added} | Removed: ${result.summary.removed} | Modified: ${result.summary.modified}

    +
      ${changes || '
    • No semantic changes detected.
    • '}
    +
    + `; + }) + .join(''); + + return ` + + + + + Compare report + + + +

    PDF Compare Report

    +

    PDF 1: ${escapeHtml(fileName1)} | PDF 2: ${escapeHtml(fileName2)}

    +
    +
    Added
    ${totals.added}
    +
    Removed
    ${totals.removed}
    +
    Modified
    ${totals.modified}
    +
    + ${rows} + +`; +} diff --git a/src/js/compare/reporting/export-html-report.ts b/src/js/compare/reporting/export-html-report.ts new file mode 100644 index 0000000..7e3a4a4 --- /dev/null +++ b/src/js/compare/reporting/export-html-report.ts @@ -0,0 +1,18 @@ +import { buildCompareReport } from './build-report.ts'; +import type { ComparePagePair, ComparePageResult } from '../types.ts'; + +export function exportCompareHtmlReport( + fileName1: string, + fileName2: string, + pairs: ComparePagePair[], + results: ComparePageResult[] +) { + const html = buildCompareReport(fileName1, fileName2, pairs, results); + const blob = new Blob([html], { type: 'text/html;charset=utf-8' }); + const url = URL.createObjectURL(blob); + const anchor = document.createElement('a'); + anchor.href = url; + anchor.download = 'bentopdf-compare-report.html'; + anchor.click(); + URL.revokeObjectURL(url); +} diff --git a/src/js/compare/types.ts b/src/js/compare/types.ts new file mode 100644 index 0000000..9996609 --- /dev/null +++ b/src/js/compare/types.ts @@ -0,0 +1,113 @@ +import type * as pdfjsLib from 'pdfjs-dist'; + +export type CompareViewMode = 'overlay' | 'side-by-side'; + +export interface CompareRectangle { + x: number; + y: number; + width: number; + height: number; +} + +export interface CharPosition { + x: number; + width: number; +} + +export interface CompareWordToken { + word: string; + compareWord: string; + rect: CompareRectangle; + joinsWithPrevious?: boolean; +} + +export interface CompareTextItem { + id: string; + text: string; + normalizedText: string; + rect: CompareRectangle; + fragments?: CompareTextItem[]; + charMap?: CharPosition[]; + wordTokens?: CompareWordToken[]; +} + +export interface ComparePageModel { + pageNumber: number; + width: number; + height: number; + textItems: CompareTextItem[]; + plainText: string; + hasText: boolean; + source: 'pdfjs' | 'ocr'; +} + +export interface ComparePageSignature { + pageNumber: number; + plainText: string; + hasText: boolean; 
+ tokenItems: CompareTextItem[]; +} + +export interface ComparePagePair { + pairIndex: number; + leftPageNumber: number | null; + rightPageNumber: number | null; + confidence: number; +} + +export interface CompareVisualDiff { + mismatchPixels: number; + mismatchRatio: number; + hasDiff: boolean; +} + +export type CompareChangeType = + | 'added' + | 'removed' + | 'modified' + | 'page-added' + | 'page-removed'; + +export interface CompareTextChange { + id: string; + type: CompareChangeType; + description: string; + beforeText: string; + afterText: string; + beforeRects: CompareRectangle[]; + afterRects: CompareRectangle[]; +} + +export interface CompareChangeSummary { + added: number; + removed: number; + modified: number; +} + +export interface ComparePageResult { + status: 'match' | 'changed' | 'left-only' | 'right-only'; + leftPageNumber: number | null; + rightPageNumber: number | null; + changes: CompareTextChange[]; + summary: CompareChangeSummary; + visualDiff: CompareVisualDiff | null; + confidence?: number; + usedOcr?: boolean; +} + +export type CompareFilterType = 'added' | 'removed' | 'modified' | 'all'; + +export interface CompareState { + pdfDoc1: pdfjsLib.PDFDocumentProxy | null; + pdfDoc2: pdfjsLib.PDFDocumentProxy | null; + currentPage: number; + viewMode: CompareViewMode; + isSyncScroll: boolean; + currentComparison: ComparePageResult | null; + activeChangeIndex: number; + pagePairs: ComparePagePair[]; + activeFilter: CompareFilterType; + changeSearchQuery: string; + useOcr: boolean; + ocrLanguage: string; +} diff --git a/src/js/logic/compare-pdfs-page.ts b/src/js/logic/compare-pdfs-page.ts index 93cd7dc..5caf8d7 100644 --- a/src/js/logic/compare-pdfs-page.ts +++ b/src/js/logic/compare-pdfs-page.ts @@ -3,297 +3,1086 @@ import { getPDFDocument } from '../utils/helpers.js'; import { icons, createIcons } from 'lucide'; import * as pdfjsLib from 'pdfjs-dist'; import { CompareState } from '@/types'; +import type { + CompareFilterType, + ComparePageModel, 
+ ComparePagePair, + ComparePageResult, + CompareTextChange, +} from '../compare/types.ts'; +import { extractPageModel } from '../compare/engine/extract-page-model.ts'; +import { comparePageModels } from '../compare/engine/compare-page-models.ts'; +import { renderVisualDiff } from '../compare/engine/visual-diff.ts'; +import { extractDocumentSignatures } from '../compare/engine/page-signatures.ts'; +import { pairPages } from '../compare/engine/pair-pages.ts'; +import { recognizePageCanvas } from '../compare/engine/ocr-page.ts'; +import { exportCompareHtmlReport } from '../compare/reporting/export-html-report.ts'; +import { isLowQualityExtractedText } from '../compare/engine/text-normalization.ts'; -pdfjsLib.GlobalWorkerOptions.workerSrc = new URL('pdfjs-dist/build/pdf.worker.min.mjs', import.meta.url).toString(); +pdfjsLib.GlobalWorkerOptions.workerSrc = new URL( + 'pdfjs-dist/build/pdf.worker.min.mjs', + import.meta.url +).toString(); const pageState: CompareState = { - pdfDoc1: null, - pdfDoc2: null, - currentPage: 1, - viewMode: 'overlay', - isSyncScroll: true, + pdfDoc1: null, + pdfDoc2: null, + currentPage: 1, + viewMode: 'side-by-side', + isSyncScroll: true, + currentComparison: null, + activeChangeIndex: 0, + pagePairs: [], + activeFilter: 'all', + changeSearchQuery: '', + useOcr: true, + ocrLanguage: 'eng', }; -async function renderPage( - pdfDoc: pdfjsLib.PDFDocumentProxy, - pageNum: number, - canvas: HTMLCanvasElement, - container: HTMLElement +const pageModelCache = new Map(); +const comparisonCache = new Map(); +const comparisonResultsCache = new Map(); +const documentNames = { + left: 'first.pdf', + right: 'second.pdf', +}; + +type RenderedPage = { + model: ComparePageModel | null; + exists: boolean; +}; + +type ComparisonPageLoad = { + model: ComparePageModel | null; + exists: boolean; +}; + +type DiffFocusRegion = { + x: number; + y: number; + width: number; + height: number; +}; + +function getElement(id: string) { + return 
document.getElementById(id) as T | null; +} + +function clearCanvas(canvas: HTMLCanvasElement) { + const context = canvas.getContext('2d'); + canvas.width = 1; + canvas.height = 1; + context?.clearRect(0, 0, 1, 1); +} + +function renderMissingPage( + canvas: HTMLCanvasElement, + placeholderId: string, + message: string ) { - const page = await pdfDoc.getPage(pageNum); + clearCanvas(canvas); + const placeholder = getElement(placeholderId); + if (placeholder) { + placeholder.textContent = message; + placeholder.classList.remove('hidden'); + } +} - const containerWidth = container.clientWidth - 2; - const viewport = page.getViewport({ scale: 1.0 }); - const scale = containerWidth / viewport.width; - const scaledViewport = page.getViewport({ scale: scale }); +function hidePlaceholder(placeholderId: string) { + const placeholder = getElement(placeholderId); + placeholder?.classList.add('hidden'); +} - canvas.width = scaledViewport.width; - canvas.height = scaledViewport.height; +function getRenderScale(page: pdfjsLib.PDFPageProxy, container: HTMLElement) { + const baseViewport = page.getViewport({ scale: 1.0 }); + const availableWidth = Math.max( + container.clientWidth - (pageState.viewMode === 'overlay' ? 96 : 56), + 320 + ); + const fitScale = availableWidth / Math.max(baseViewport.width, 1); + const maxScale = pageState.viewMode === 'overlay' ? 
2.5 : 2.0; - await page.render({ - canvasContext: canvas.getContext('2d')!, - viewport: scaledViewport, - canvas - }).promise; + return Math.min(Math.max(fitScale, 1.0), maxScale); +} + +function getPageModelCacheKey( + cacheKeyPrefix: 'left' | 'right', + pageNum: number, + scale: number +) { + return `${cacheKeyPrefix}-${pageNum}-${scale.toFixed(3)}`; +} + +function shouldUseOcrForModel(model: ComparePageModel) { + return !model.hasText || isLowQualityExtractedText(model.plainText); +} + +function buildDiffFocusRegion( + comparison: ComparePageResult, + leftCanvas: HTMLCanvasElement, + rightCanvas: HTMLCanvasElement +): DiffFocusRegion | undefined { + const leftOffsetX = Math.floor( + (Math.max(leftCanvas.width, rightCanvas.width) - leftCanvas.width) / 2 + ); + const leftOffsetY = Math.floor( + (Math.max(leftCanvas.height, rightCanvas.height) - leftCanvas.height) / 2 + ); + const rightOffsetX = Math.floor( + (Math.max(leftCanvas.width, rightCanvas.width) - rightCanvas.width) / 2 + ); + const rightOffsetY = Math.floor( + (Math.max(leftCanvas.height, rightCanvas.height) - rightCanvas.height) / 2 + ); + const bounds = { + minX: Infinity, + minY: Infinity, + maxX: -Infinity, + maxY: -Infinity, + }; + + for (const change of comparison.changes) { + for (const rect of change.beforeRects) { + bounds.minX = Math.min(bounds.minX, rect.x + leftOffsetX); + bounds.minY = Math.min(bounds.minY, rect.y + leftOffsetY); + bounds.maxX = Math.max(bounds.maxX, rect.x + leftOffsetX + rect.width); + bounds.maxY = Math.max(bounds.maxY, rect.y + leftOffsetY + rect.height); + } + + for (const rect of change.afterRects) { + bounds.minX = Math.min(bounds.minX, rect.x + rightOffsetX); + bounds.minY = Math.min(bounds.minY, rect.y + rightOffsetY); + bounds.maxX = Math.max(bounds.maxX, rect.x + rightOffsetX + rect.width); + bounds.maxY = Math.max(bounds.maxY, rect.y + rightOffsetY + rect.height); + } + } + + if (!Number.isFinite(bounds.minX)) { + return undefined; + } + + const fullWidth = 
Math.max(leftCanvas.width, rightCanvas.width, 1); + const fullHeight = Math.max(leftCanvas.height, rightCanvas.height, 1); + const padding = 40; + + const x = Math.max(Math.floor(bounds.minX - padding), 0); + const y = Math.max(Math.floor(bounds.minY - padding), 0); + const maxX = Math.min(Math.ceil(bounds.maxX + padding), fullWidth); + const maxY = Math.min(Math.ceil(bounds.maxY + padding), fullHeight); + + return { + x, + y, + width: Math.max(maxX - x, Math.min(320, fullWidth)), + height: Math.max(maxY - y, Math.min(200, fullHeight)), + }; +} + +async function renderPage( + pdfDoc: pdfjsLib.PDFDocumentProxy, + pageNum: number, + canvas: HTMLCanvasElement, + container: HTMLElement, + placeholderId: string, + cacheKeyPrefix: 'left' | 'right' +): Promise { + if (pageNum > pdfDoc.numPages) { + renderMissingPage( + canvas, + placeholderId, + `Page ${pageNum} does not exist in this PDF.` + ); + return { model: null, exists: false }; + } + + const page = await pdfDoc.getPage(pageNum); + + const targetScale = getRenderScale(page, container); + const scaledViewport = page.getViewport({ scale: targetScale }); + const dpr = window.devicePixelRatio || 1; + const hiResViewport = page.getViewport({ scale: targetScale * dpr }); + + hidePlaceholder(placeholderId); + + canvas.width = hiResViewport.width; + canvas.height = hiResViewport.height; + canvas.style.width = `${scaledViewport.width}px`; + canvas.style.height = `${scaledViewport.height}px`; + + const cacheKey = getPageModelCacheKey(cacheKeyPrefix, pageNum, targetScale); + const cachedModel = pageModelCache.get(cacheKey); + const modelPromise = cachedModel + ? 
Promise.resolve(cachedModel) + : extractPageModel(page, scaledViewport); + const renderTask = page.render({ + canvasContext: canvas.getContext('2d')!, + viewport: hiResViewport, + canvas, + }).promise; + + const [model] = await Promise.all([modelPromise, renderTask]); + + let finalModel = model; + + if (!cachedModel && pageState.useOcr && shouldUseOcrForModel(model)) { + showLoader(`Running OCR on page ${pageNum}...`); + const ocrModel = await recognizePageCanvas( + canvas, + pageState.ocrLanguage, + function (status, progress) { + showLoader(`OCR: ${status}`, progress * 100); + } + ); + finalModel = { + ...ocrModel, + pageNumber: pageNum, + }; + } + + pageModelCache.set(cacheKey, finalModel); + + return { model: finalModel, exists: true }; +} + +async function loadComparisonPage( + pdfDoc: pdfjsLib.PDFDocumentProxy | null, + pageNum: number | null, + side: 'left' | 'right', + renderTarget?: { + canvas: HTMLCanvasElement; + container: HTMLElement; + placeholderId: string; + } +): Promise { + if (!pdfDoc || !pageNum) { + if (renderTarget) { + renderMissingPage( + renderTarget.canvas, + renderTarget.placeholderId, + 'No paired page for this side.' 
+ ); + } + return { model: null, exists: false }; + } + + if (renderTarget) { + return renderPage( + pdfDoc, + pageNum, + renderTarget.canvas, + renderTarget.container, + renderTarget.placeholderId, + side + ); + } + + const renderScale = 1.2; + const cacheKey = getPageModelCacheKey(side, pageNum, renderScale); + const cachedModel = pageModelCache.get(cacheKey); + if (cachedModel) { + return { model: cachedModel, exists: true }; + } + + const page = await pdfDoc.getPage(pageNum); + const viewport = page.getViewport({ scale: renderScale }); + const canvas = document.createElement('canvas'); + canvas.width = viewport.width; + canvas.height = viewport.height; + const context = canvas.getContext('2d'); + + if (!context) { + throw new Error('Could not create offscreen comparison canvas.'); + } + + const extractedModel = await extractPageModel(page, viewport); + await page.render({ + canvasContext: context, + viewport, + canvas, + }).promise; + + let finalModel = extractedModel; + if (pageState.useOcr && shouldUseOcrForModel(extractedModel)) { + const ocrModel = await recognizePageCanvas(canvas, pageState.ocrLanguage); + finalModel = { + ...ocrModel, + pageNumber: pageNum, + }; + } + + pageModelCache.set(cacheKey, finalModel); + return { model: finalModel, exists: true }; +} + +async function computeComparisonForPair( + pair: ComparePagePair, + options?: { + renderTargets?: { + left: { + canvas: HTMLCanvasElement; + container: HTMLElement; + placeholderId: string; + }; + right: { + canvas: HTMLCanvasElement; + container: HTMLElement; + placeholderId: string; + }; + diffCanvas?: HTMLCanvasElement; + }; + } +) { + const renderTargets = options?.renderTargets; + const leftPage = await loadComparisonPage( + pageState.pdfDoc1, + pair.leftPageNumber, + 'left', + renderTargets?.left + ); + const rightPage = await loadComparisonPage( + pageState.pdfDoc2, + pair.rightPageNumber, + 'right', + renderTargets?.right + ); + + const comparison = comparePageModels(leftPage.model, 
rightPage.model); + comparison.confidence = pair.confidence; + + if ( + renderTargets?.diffCanvas && + comparison.status !== 'left-only' && + comparison.status !== 'right-only' + ) { + const focusRegion = buildDiffFocusRegion( + comparison, + renderTargets.left.canvas, + renderTargets.right.canvas + ); + comparison.visualDiff = renderVisualDiff( + renderTargets.left.canvas, + renderTargets.right.canvas, + renderTargets.diffCanvas, + focusRegion + ); + } else if (renderTargets?.diffCanvas) { + clearCanvas(renderTargets.diffCanvas); + } + + return comparison; +} + +function getActivePair() { + return pageState.pagePairs[pageState.currentPage - 1] || null; +} + +function getVisibleChanges(result: ComparePageResult | null) { + if (!result) return []; + + const filteredByType = + pageState.activeFilter === 'all' + ? result.changes + : result.changes.filter((change) => { + if (pageState.activeFilter === 'removed') { + return change.type === 'removed' || change.type === 'page-removed'; + } + return change.type === pageState.activeFilter; + }); + + const searchQuery = pageState.changeSearchQuery.trim().toLowerCase(); + if (!searchQuery) { + return filteredByType; + } + + return filteredByType.filter((change) => { + const searchableText = [ + change.description, + change.beforeText, + change.afterText, + ] + .join(' ') + .toLowerCase(); + return searchableText.includes(searchQuery); + }); +} + +function updateFilterButtons() { + const pills: Array<{ id: string; filter: CompareFilterType }> = [ + { id: 'filter-modified', filter: 'modified' }, + { id: 'filter-added', filter: 'added' }, + { id: 'filter-removed', filter: 'removed' }, + ]; + + pills.forEach(({ id, filter }) => { + const button = getElement(id); + if (!button) return; + button.classList.toggle('active', pageState.activeFilter === filter); + }); +} + +function updateSummary() { + const comparison = pageState.currentComparison; + const addedCount = getElement('summary-added-count'); + const removedCount = 
getElement('summary-removed-count'); + const modifiedCount = getElement('summary-modified-count'); + const panelLabel1 = getElement('compare-panel-label-1'); + const panelLabel2 = getElement('compare-panel-label-2'); + + if (panelLabel1) panelLabel1.textContent = documentNames.left; + if (panelLabel2) panelLabel2.textContent = documentNames.right; + + if (!comparison) { + if (addedCount) addedCount.textContent = '0'; + if (removedCount) removedCount.textContent = '0'; + if (modifiedCount) modifiedCount.textContent = '0'; + return; + } + + if (addedCount) addedCount.textContent = comparison.summary.added.toString(); + if (removedCount) + removedCount.textContent = comparison.summary.removed.toString(); + if (modifiedCount) + modifiedCount.textContent = comparison.summary.modified.toString(); +} + +function renderHighlights() { + const highlightLayer1 = getElement('highlights-1'); + const highlightLayer2 = getElement('highlights-2'); + + if (!highlightLayer1 || !highlightLayer2) return; + + highlightLayer1.innerHTML = ''; + highlightLayer2.innerHTML = ''; + + const comparison = pageState.currentComparison; + if (!comparison) return; + + getVisibleChanges(comparison).forEach((change, index) => { + const activeClass = index === pageState.activeChangeIndex ? 
' active' : ''; + change.beforeRects.forEach((rect) => { + const marker = document.createElement('div'); + marker.className = `compare-highlight ${change.type}${activeClass}`; + marker.style.left = `${rect.x}px`; + marker.style.top = `${rect.y}px`; + marker.style.width = `${rect.width}px`; + marker.style.height = `${rect.height}px`; + highlightLayer1.appendChild(marker); + }); + + change.afterRects.forEach((rect) => { + const marker = document.createElement('div'); + marker.className = `compare-highlight ${change.type}${activeClass}`; + marker.style.left = `${rect.x}px`; + marker.style.top = `${rect.y}px`; + marker.style.width = `${rect.width}px`; + marker.style.height = `${rect.height}px`; + highlightLayer2.appendChild(marker); + }); + }); +} + +function scrollToChange(change: CompareTextChange) { + const panel1 = getElement('panel-1'); + const panel2 = getElement('panel-2'); + const firstBefore = change.beforeRects[0]; + const firstAfter = change.afterRects[0]; + + if (panel1 && firstBefore) { + panel1.scrollTo({ + top: Math.max(firstBefore.y - 40, 0), + behavior: 'smooth', + }); + } + + if (panel2 && firstAfter) { + panel2.scrollTo({ + top: Math.max(firstAfter.y - 40, 0), + behavior: 'smooth', + }); + } +} + +function renderChangeList() { + const comparison = pageState.currentComparison; + const list = getElement('compare-change-list'); + const emptyState = getElement('change-list-empty'); + const prevChangeBtn = getElement('prev-change-btn'); + const nextChangeBtn = getElement('next-change-btn'); + const exportReportBtn = getElement('export-report-btn'); + + if ( + !list || + !emptyState || + !prevChangeBtn || + !nextChangeBtn || + !exportReportBtn + ) + return; + + list.innerHTML = ''; + const visibleChanges = getVisibleChanges(comparison); + + if (!comparison || visibleChanges.length === 0) { + emptyState.textContent = + comparison?.status === 'match' + ? 'No differences detected on this page.' 
+ : 'No changes match the current filter.'; + emptyState.classList.remove('hidden'); + list.classList.add('hidden'); + prevChangeBtn.disabled = true; + nextChangeBtn.disabled = true; + exportReportBtn.disabled = pageState.pagePairs.length === 0; + return; + } + + emptyState.classList.add('hidden'); + list.classList.remove('hidden'); + + visibleChanges.forEach((change, index) => { + const item = document.createElement('div'); + item.className = `compare-change-item${index === pageState.activeChangeIndex ? ' active' : ''}`; + item.innerHTML = ` + +
    +
    ${change.description}
    +
    + ${change.type.replace('-', ' ')} + `; + + item.addEventListener('click', function () { + pageState.activeChangeIndex = index; + renderComparisonUI(); + scrollToChange(change); + }); + + list.appendChild(item); + }); + + prevChangeBtn.disabled = false; + nextChangeBtn.disabled = false; + exportReportBtn.disabled = pageState.pagePairs.length === 0; +} + +function renderComparisonUI() { + updateFilterButtons(); + renderHighlights(); + renderChangeList(); + updateSummary(); +} + +async function buildPagePairs() { + if (!pageState.pdfDoc1 || !pageState.pdfDoc2) return; + + showLoader('Building page pairing model...', 0); + + const leftSignatures = await extractDocumentSignatures( + pageState.pdfDoc1, + function (pageNumber, totalPages) { + showLoader( + `Indexing PDF 1 page ${pageNumber} of ${totalPages}...`, + (pageNumber / Math.max(totalPages * 2, 1)) * 100 + ); + } + ); + const rightSignatures = await extractDocumentSignatures( + pageState.pdfDoc2, + function (pageNumber, totalPages) { + showLoader( + `Indexing PDF 2 page ${pageNumber} of ${totalPages}...`, + 50 + (pageNumber / Math.max(totalPages * 2, 1)) * 100 + ); + } + ); + + pageState.pagePairs = pairPages(leftSignatures, rightSignatures); + pageState.currentPage = 1; +} + +async function buildReportResults() { + const results: ComparePageResult[] = []; + + for (const pair of pageState.pagePairs) { + const cached = comparisonResultsCache.get(pair.pairIndex); + if (cached) { + results.push(cached); + continue; + } + + const leftSignatureKey = pair.leftPageNumber + ? `left-${pair.leftPageNumber}` + : ''; + const rightSignatureKey = pair.rightPageNumber + ? `right-${pair.rightPageNumber}` + : ''; + const cachedResult = comparisonCache.get( + `${leftSignatureKey || 'none'}:${rightSignatureKey || 'none'}:${pageState.useOcr ? 
'ocr' : 'no-ocr'}` + ); + if (cachedResult) { + results.push(cachedResult); + continue; + } + + const comparison = await computeComparisonForPair(pair); + comparisonCache.set( + `${leftSignatureKey || 'none'}:${rightSignatureKey || 'none'}:${pageState.useOcr ? 'ocr' : 'no-ocr'}`, + comparison + ); + comparisonResultsCache.set(pair.pairIndex, comparison); + results.push(comparison); + } + + return results; } async function renderBothPages() { - if (!pageState.pdfDoc1 || !pageState.pdfDoc2) return; + if (!pageState.pdfDoc1 || !pageState.pdfDoc2) return; - showLoader(`Loading page ${pageState.currentPage}...`); + const pair = getActivePair(); + if (!pair) return; - const canvas1 = document.getElementById('canvas-compare-1') as HTMLCanvasElement; - const canvas2 = document.getElementById('canvas-compare-2') as HTMLCanvasElement; - const panel1 = document.getElementById('panel-1') as HTMLElement; - const panel2 = document.getElementById('panel-2') as HTMLElement; - const wrapper = document.getElementById('compare-viewer-wrapper') as HTMLElement; + showLoader( + `Loading comparison ${pageState.currentPage} of ${pageState.pagePairs.length}...` + ); - const container1 = pageState.viewMode === 'overlay' ? wrapper : panel1; - const container2 = pageState.viewMode === 'overlay' ? 
wrapper : panel2; + const canvas1 = getElement( + 'canvas-compare-1' + ) as HTMLCanvasElement; + const canvas2 = getElement( + 'canvas-compare-2' + ) as HTMLCanvasElement; + const panel1 = getElement('panel-1') as HTMLElement; + const panel2 = getElement('panel-2') as HTMLElement; + const wrapper = getElement( + 'compare-viewer-wrapper' + ) as HTMLElement; - await Promise.all([ - renderPage( - pageState.pdfDoc1, - Math.min(pageState.currentPage, pageState.pdfDoc1.numPages), - canvas1, - container1 - ), - renderPage( - pageState.pdfDoc2, - Math.min(pageState.currentPage, pageState.pdfDoc2.numPages), - canvas2, - container2 - ), - ]); + const container1 = panel1; + const container2 = pageState.viewMode === 'overlay' ? panel1 : panel2; - updateNavControls(); - hideLoader(); + const comparison = await computeComparisonForPair(pair, { + renderTargets: { + left: { + canvas: canvas1, + container: container1, + placeholderId: 'placeholder-1', + }, + right: { + canvas: canvas2, + container: container2, + placeholderId: 'placeholder-2', + }, + }, + }); + + pageState.currentComparison = comparison; + pageState.activeChangeIndex = 0; + + updateNavControls(); + renderComparisonUI(); + hideLoader(); } function updateNavControls() { - const maxPages = Math.max( - pageState.pdfDoc1?.numPages || 0, - pageState.pdfDoc2?.numPages || 0 + const totalPairs = + pageState.pagePairs.length || + Math.max( + pageState.pdfDoc1?.numPages || 0, + pageState.pdfDoc2?.numPages || 0 ); - const currentDisplay = document.getElementById('current-page-display-compare'); - const totalDisplay = document.getElementById('total-pages-display-compare'); - const prevBtn = document.getElementById('prev-page-compare') as HTMLButtonElement; - const nextBtn = document.getElementById('next-page-compare') as HTMLButtonElement; + const currentDisplay = document.getElementById( + 'current-page-display-compare' + ); + const totalDisplay = document.getElementById('total-pages-display-compare'); + const prevBtn = 
document.getElementById( + 'prev-page-compare' + ) as HTMLButtonElement; + const nextBtn = document.getElementById( + 'next-page-compare' + ) as HTMLButtonElement; - if (currentDisplay) currentDisplay.textContent = pageState.currentPage.toString(); - if (totalDisplay) totalDisplay.textContent = maxPages.toString(); - if (prevBtn) prevBtn.disabled = pageState.currentPage <= 1; - if (nextBtn) nextBtn.disabled = pageState.currentPage >= maxPages; + if (currentDisplay) + currentDisplay.textContent = pageState.currentPage.toString(); + if (totalDisplay) totalDisplay.textContent = totalPairs.toString(); + if (prevBtn) prevBtn.disabled = pageState.currentPage <= 1; + if (nextBtn) nextBtn.disabled = pageState.currentPage >= totalPairs; } function setViewMode(mode: 'overlay' | 'side-by-side') { - pageState.viewMode = mode; - const wrapper = document.getElementById('compare-viewer-wrapper'); - const overlayControls = document.getElementById('overlay-controls'); - const sideControls = document.getElementById('side-by-side-controls'); - const btnOverlay = document.getElementById('view-mode-overlay'); - const btnSide = document.getElementById('view-mode-side'); - const canvas2 = document.getElementById('canvas-compare-2') as HTMLCanvasElement; - const opacitySlider = document.getElementById('opacity-slider') as HTMLInputElement; + pageState.viewMode = mode; + const wrapper = document.getElementById('compare-viewer-wrapper'); + const overlayControls = document.getElementById('overlay-controls'); + const sideControls = document.getElementById('side-by-side-controls'); + const btnOverlay = document.getElementById('view-mode-overlay'); + const btnSide = document.getElementById('view-mode-side'); + const canvas2 = getElement( + 'canvas-compare-2' + ) as HTMLCanvasElement; + const opacitySlider = getElement( + 'opacity-slider' + ) as HTMLInputElement; - if (mode === 'overlay') { - if (wrapper) wrapper.className = 'compare-viewer-wrapper overlay-mode bg-gray-900 rounded-lg border 
border-gray-700 min-h-[400px] relative'; - if (overlayControls) overlayControls.classList.remove('hidden'); - if (sideControls) sideControls.classList.add('hidden'); - if (btnOverlay) { - btnOverlay.classList.add('bg-indigo-600'); - btnOverlay.classList.remove('bg-gray-700'); - } - if (btnSide) { - btnSide.classList.remove('bg-indigo-600'); - btnSide.classList.add('bg-gray-700'); - } - if (canvas2 && opacitySlider) canvas2.style.opacity = opacitySlider.value; - } else { - if (wrapper) wrapper.className = 'compare-viewer-wrapper side-by-side-mode bg-gray-900 rounded-lg border border-gray-700 min-h-[400px]'; - if (overlayControls) overlayControls.classList.add('hidden'); - if (sideControls) sideControls.classList.remove('hidden'); - if (btnOverlay) { - btnOverlay.classList.remove('bg-indigo-600'); - btnOverlay.classList.add('bg-gray-700'); - } - if (btnSide) { - btnSide.classList.add('bg-indigo-600'); - btnSide.classList.remove('bg-gray-700'); - } - if (canvas2) canvas2.style.opacity = '1'; + if (mode === 'overlay') { + if (wrapper) + wrapper.className = + 'compare-viewer-wrapper overlay-mode border border-slate-200'; + if (overlayControls) overlayControls.classList.remove('hidden'); + if (sideControls) sideControls.classList.add('hidden'); + if (btnOverlay) { + btnOverlay.classList.add('bg-indigo-600'); + btnOverlay.classList.remove('bg-gray-700'); } + if (btnSide) { + btnSide.classList.remove('bg-indigo-600'); + btnSide.classList.add('bg-gray-700'); + } + if (canvas2 && opacitySlider) { + const panel2 = getElement('panel-2'); + if (panel2) panel2.style.opacity = opacitySlider.value; + } + pageState.isSyncScroll = true; + } else { + if (wrapper) + wrapper.className = + 'compare-viewer-wrapper side-by-side-mode border border-slate-200'; + if (overlayControls) overlayControls.classList.add('hidden'); + if (sideControls) sideControls.classList.remove('hidden'); + if (btnOverlay) { + btnOverlay.classList.remove('bg-indigo-600'); + 
btnOverlay.classList.add('bg-gray-700'); + } + if (btnSide) { + btnSide.classList.add('bg-indigo-600'); + btnSide.classList.remove('bg-gray-700'); + } + if (canvas2) canvas2.style.opacity = '1'; + const panel2 = getElement('panel-2'); + if (panel2) panel2.style.opacity = '1'; + } + + const p1 = getElement('panel-1'); + const p2 = getElement('panel-2'); + if (mode === 'overlay' && p1 && p2) { + p2.scrollTop = p1.scrollTop; + p2.scrollLeft = p1.scrollLeft; + } + + if (pageState.pdfDoc1 && pageState.pdfDoc2) { renderBothPages(); + } } -async function handleFileInput(inputId: string, docKey: 'pdfDoc1' | 'pdfDoc2', displayId: string) { - const fileInput = document.getElementById(inputId) as HTMLInputElement; - const dropZone = document.getElementById(`drop-zone-${inputId.slice(-1)}`); +async function handleFileInput( + inputId: string, + docKey: 'pdfDoc1' | 'pdfDoc2', + displayId: string +) { + const fileInput = document.getElementById(inputId) as HTMLInputElement; + const dropZone = document.getElementById(`drop-zone-${inputId.slice(-1)}`); - async function handleFile(file: File) { - if (!file || file.type !== 'application/pdf') { - showAlert('Invalid File', 'Please select a valid PDF file.'); - return; - } - - const displayDiv = document.getElementById(displayId); - if (displayDiv) { - displayDiv.innerHTML = ''; - - const icon = document.createElement('i'); - icon.setAttribute('data-lucide', 'check-circle'); - icon.className = 'w-10 h-10 mb-3 text-green-500'; - - const p = document.createElement('p'); - p.className = 'text-sm text-gray-300 truncate'; - p.textContent = file.name; - - displayDiv.append(icon, p); - createIcons({ icons }); - } - - try { - showLoader(`Loading ${file.name}...`); - const arrayBuffer = await file.arrayBuffer(); - pageState[docKey] = await getPDFDocument({ data: arrayBuffer }).promise; - - if (pageState.pdfDoc1 && pageState.pdfDoc2) { - const compareViewer = document.getElementById('compare-viewer'); - if (compareViewer) 
compareViewer.classList.remove('hidden'); - pageState.currentPage = 1; - await renderBothPages(); - } - } catch (e) { - showAlert('Error', 'Could not load PDF. It may be corrupt or password-protected.'); - console.error(e); - } finally { - hideLoader(); - } + async function handleFile(file: File) { + if (!file || file.type !== 'application/pdf') { + showAlert('Invalid File', 'Please select a valid PDF file.'); + return; } - if (fileInput) { - fileInput.addEventListener('change', function (e) { - const files = (e.target as HTMLInputElement).files; - if (files && files[0]) handleFile(files[0]); - }); + const displayDiv = document.getElementById(displayId); + if (displayDiv) { + displayDiv.innerHTML = ''; + + const icon = document.createElement('i'); + icon.setAttribute('data-lucide', 'check-circle'); + icon.className = 'w-10 h-10 mb-3 text-green-500'; + + const p = document.createElement('p'); + p.className = 'text-sm text-gray-300 truncate'; + p.textContent = file.name; + + if (docKey === 'pdfDoc1') documentNames.left = file.name; + if (docKey === 'pdfDoc2') documentNames.right = file.name; + + const panelLabel1 = getElement('compare-panel-label-1'); + const panelLabel2 = getElement('compare-panel-label-2'); + if (docKey === 'pdfDoc1' && panelLabel1) + panelLabel1.textContent = file.name; + if (docKey === 'pdfDoc2' && panelLabel2) + panelLabel2.textContent = file.name; + + displayDiv.append(icon, p); + createIcons({ icons }); } - if (dropZone) { - dropZone.addEventListener('dragover', function (e) { - e.preventDefault(); - }); - dropZone.addEventListener('drop', function (e) { - e.preventDefault(); - const files = e.dataTransfer?.files; - if (files && files[0]) handleFile(files[0]); - }); + try { + showLoader(`Loading ${file.name}...`); + const arrayBuffer = await file.arrayBuffer(); + pageState[docKey] = await getPDFDocument({ data: arrayBuffer }).promise; + pageModelCache.clear(); + comparisonCache.clear(); + comparisonResultsCache.clear(); + 
pageState.changeSearchQuery = ''; + + const searchInput = getElement('compare-search-input'); + if (searchInput) { + searchInput.value = ''; + } + + if (pageState.pdfDoc1 && pageState.pdfDoc2) { + const compareViewer = document.getElementById('compare-viewer'); + if (compareViewer) compareViewer.classList.remove('hidden'); + await buildPagePairs(); + await renderBothPages(); + } + } catch (e) { + showAlert( + 'Error', + 'Could not load PDF. It may be corrupt or password-protected.' + ); + console.error(e); + } finally { + hideLoader(); } + } + + if (fileInput) { + fileInput.addEventListener('change', function (e) { + const files = (e.target as HTMLInputElement).files; + if (files && files[0]) handleFile(files[0]); + }); + } + + if (dropZone) { + dropZone.addEventListener('dragover', function (e) { + e.preventDefault(); + }); + dropZone.addEventListener('drop', function (e) { + e.preventDefault(); + const files = e.dataTransfer?.files; + if (files && files[0]) handleFile(files[0]); + }); + } } document.addEventListener('DOMContentLoaded', function () { - const backBtn = document.getElementById('back-to-tools'); + const backBtn = getElement('back-to-tools'); - if (backBtn) { - backBtn.addEventListener('click', function () { - window.location.href = import.meta.env.BASE_URL; - }); + if (backBtn) { + backBtn.addEventListener('click', function () { + window.location.href = import.meta.env.BASE_URL; + }); + } + + handleFileInput('file-input-1', 'pdfDoc1', 'file-display-1'); + handleFileInput('file-input-2', 'pdfDoc2', 'file-display-2'); + + const prevBtn = getElement('prev-page-compare'); + const nextBtn = getElement('next-page-compare'); + + if (prevBtn) { + prevBtn.addEventListener('click', function () { + if (pageState.currentPage > 1) { + pageState.currentPage--; + renderBothPages(); + } + }); + } + + if (nextBtn) { + nextBtn.addEventListener('click', function () { + const totalPairs = + pageState.pagePairs.length || + Math.max( + pageState.pdfDoc1?.numPages || 0, + 
pageState.pdfDoc2?.numPages || 0 + ); + if (pageState.currentPage < totalPairs) { + pageState.currentPage++; + renderBothPages(); + } + }); + } + + const btnOverlay = getElement('view-mode-overlay'); + const btnSide = getElement('view-mode-side'); + + if (btnOverlay) { + btnOverlay.addEventListener('click', function () { + setViewMode('overlay'); + }); + } + + if (btnSide) { + btnSide.addEventListener('click', function () { + setViewMode('side-by-side'); + }); + } + + const flickerBtn = getElement('flicker-btn'); + const canvas2 = getElement( + 'canvas-compare-2' + ) as HTMLCanvasElement; + const opacitySlider = getElement( + 'opacity-slider' + ) as HTMLInputElement; + + // Track flicker state + let flickerVisible = true; + + if (flickerBtn) { + flickerBtn.addEventListener('click', function () { + flickerVisible = !flickerVisible; + const p2 = getElement('panel-2'); + if (p2) { + p2.style.transition = 'opacity 150ms ease-in-out'; + p2.style.opacity = flickerVisible ? opacitySlider?.value || '0.5' : '0'; + } + }); + } + + if (opacitySlider) { + opacitySlider.addEventListener('input', function () { + flickerVisible = true; + const p2 = getElement('panel-2'); + if (p2) { + p2.style.transition = ''; + p2.style.opacity = opacitySlider.value; + } + }); + } + + const panel1 = getElement('panel-1'); + const panel2 = getElement('panel-2'); + const syncToggle = getElement( + 'sync-scroll-toggle' + ) as HTMLInputElement; + const prevChangeBtn = getElement('prev-change-btn'); + const nextChangeBtn = getElement('next-change-btn'); + const exportReportBtn = getElement('export-report-btn'); + const ocrToggle = getElement('ocr-toggle'); + const searchInput = getElement('compare-search-input'); + + const filterButtons: Array<{ id: string; filter: CompareFilterType }> = [ + { id: 'filter-modified', filter: 'modified' }, + { id: 'filter-added', filter: 'added' }, + { id: 'filter-removed', filter: 'removed' }, + ]; + + if (syncToggle) { + syncToggle.addEventListener('change', function 
() { + pageState.isSyncScroll = syncToggle.checked; + }); + } + + let scrollingPanel: HTMLElement | null = null; + + if (panel1 && panel2) { + panel1.addEventListener('scroll', function () { + if (pageState.isSyncScroll && scrollingPanel !== panel2) { + scrollingPanel = panel1; + panel2.scrollTop = panel1.scrollTop; + panel2.scrollLeft = panel1.scrollLeft; + setTimeout(function () { + scrollingPanel = null; + }, 100); + } + }); + + panel2.addEventListener('scroll', function () { + if (pageState.viewMode === 'overlay') return; + if (pageState.isSyncScroll && scrollingPanel !== panel1) { + scrollingPanel = panel2; + panel1.scrollTop = panel2.scrollTop; + panel1.scrollLeft = panel2.scrollLeft; + setTimeout(function () { + scrollingPanel = null; + }, 100); + } + }); + } + + if (prevChangeBtn) { + prevChangeBtn.addEventListener('click', function () { + const changes = getVisibleChanges(pageState.currentComparison); + if (changes.length === 0) return; + pageState.activeChangeIndex = + (pageState.activeChangeIndex - 1 + changes.length) % changes.length; + renderComparisonUI(); + scrollToChange(changes[pageState.activeChangeIndex]); + }); + } + + if (nextChangeBtn) { + nextChangeBtn.addEventListener('click', function () { + const changes = getVisibleChanges(pageState.currentComparison); + if (changes.length === 0) return; + pageState.activeChangeIndex = + (pageState.activeChangeIndex + 1) % changes.length; + renderComparisonUI(); + scrollToChange(changes[pageState.activeChangeIndex]); + }); + } + + filterButtons.forEach(({ id, filter }) => { + const button = getElement(id); + if (!button) return; + button.addEventListener('click', function () { + if (pageState.activeFilter === filter) { + pageState.activeFilter = 'all'; + } else { + pageState.activeFilter = filter; + } + pageState.activeChangeIndex = 0; + renderComparisonUI(); + }); + }); + + if (ocrToggle) { + ocrToggle.checked = pageState.useOcr; + ocrToggle.addEventListener('change', async function () { + 
pageState.useOcr = ocrToggle.checked; + pageModelCache.clear(); + comparisonCache.clear(); + comparisonResultsCache.clear(); + if (pageState.pdfDoc1 && pageState.pdfDoc2) { + await renderBothPages(); + } + }); + } + + if (searchInput) { + searchInput.addEventListener('input', function () { + pageState.changeSearchQuery = searchInput.value; + pageState.activeChangeIndex = 0; + renderComparisonUI(); + }); + } + + let resizeFrame = 0; + window.addEventListener('resize', function () { + if (!pageState.pdfDoc1 || !pageState.pdfDoc2) { + return; } - handleFileInput('file-input-1', 'pdfDoc1', 'file-display-1'); - handleFileInput('file-input-2', 'pdfDoc2', 'file-display-2'); + window.cancelAnimationFrame(resizeFrame); + resizeFrame = window.requestAnimationFrame(function () { + renderBothPages(); + }); + }); - const prevBtn = document.getElementById('prev-page-compare'); - const nextBtn = document.getElementById('next-page-compare'); + if (exportReportBtn) { + exportReportBtn.addEventListener('click', async function () { + if (pageState.pagePairs.length === 0) return; + showLoader('Building compare report...'); + const results = await buildReportResults(); + exportCompareHtmlReport( + documentNames.left, + documentNames.right, + pageState.pagePairs, + results + ); + hideLoader(); + }); + } - if (prevBtn) { - prevBtn.addEventListener('click', function () { - if (pageState.currentPage > 1) { - pageState.currentPage--; - renderBothPages(); - } - }); - } - - if (nextBtn) { - nextBtn.addEventListener('click', function () { - const maxPages = Math.max( - pageState.pdfDoc1?.numPages || 0, - pageState.pdfDoc2?.numPages || 0 - ); - if (pageState.currentPage < maxPages) { - pageState.currentPage++; - renderBothPages(); - } - }); - } - - const btnOverlay = document.getElementById('view-mode-overlay'); - const btnSide = document.getElementById('view-mode-side'); - - if (btnOverlay) { - btnOverlay.addEventListener('click', function () { - setViewMode('overlay'); - }); - } - - if 
(btnSide) { - btnSide.addEventListener('click', function () { - setViewMode('side-by-side'); - }); - } - - const flickerBtn = document.getElementById('flicker-btn'); - const canvas2 = document.getElementById('canvas-compare-2') as HTMLCanvasElement; - const opacitySlider = document.getElementById('opacity-slider') as HTMLInputElement; - - // Track flicker state - let flickerVisible = true; - - if (flickerBtn && canvas2) { - flickerBtn.addEventListener('click', function () { - flickerVisible = !flickerVisible; - canvas2.style.transition = 'opacity 150ms ease-in-out'; - canvas2.style.opacity = flickerVisible ? (opacitySlider?.value || '0.5') : '0'; - }); - } - - if (opacitySlider && canvas2) { - opacitySlider.addEventListener('input', function () { - flickerVisible = true; // Reset flicker state when slider changes - canvas2.style.transition = ''; - canvas2.style.opacity = opacitySlider.value; - }); - } - - const panel1 = document.getElementById('panel-1'); - const panel2 = document.getElementById('panel-2'); - const syncToggle = document.getElementById('sync-scroll-toggle') as HTMLInputElement; - - if (syncToggle) { - syncToggle.addEventListener('change', function () { - pageState.isSyncScroll = syncToggle.checked; - }); - } - - let scrollingPanel: HTMLElement | null = null; - - if (panel1 && panel2) { - panel1.addEventListener('scroll', function () { - if (pageState.isSyncScroll && scrollingPanel !== panel2) { - scrollingPanel = panel1; - panel2.scrollTop = panel1.scrollTop; - setTimeout(function () { scrollingPanel = null; }, 100); - } - }); - - panel2.addEventListener('scroll', function () { - if (pageState.isSyncScroll && scrollingPanel !== panel1) { - scrollingPanel = panel2; - panel1.scrollTop = panel2.scrollTop; - setTimeout(function () { scrollingPanel = null; }, 100); - } - }); - } - - createIcons({ icons }); + createIcons({ icons }); + updateFilterButtons(); + setViewMode(pageState.viewMode); }); diff --git a/src/js/types/compare-pdfs-type.ts 
b/src/js/types/compare-pdfs-type.ts index de54566..d3fa47c 100644 --- a/src/js/types/compare-pdfs-type.ts +++ b/src/js/types/compare-pdfs-type.ts @@ -1,9 +1 @@ -import * as pdfjsLib from 'pdfjs-dist'; - -export interface CompareState { - pdfDoc1: pdfjsLib.PDFDocumentProxy | null; - pdfDoc2: pdfjsLib.PDFDocumentProxy | null; - currentPage: number; - viewMode: 'overlay' | 'side-by-side'; - isSyncScroll: boolean; -} +export type { CompareState } from '../compare/types.ts'; diff --git a/src/pages/compare-pdfs.html b/src/pages/compare-pdfs.html index 2bea2ca..43f0622 100644 --- a/src/pages/compare-pdfs.html +++ b/src/pages/compare-pdfs.html @@ -72,31 +72,362 @@ @@ -134,7 +465,7 @@ >
    - - Page 1 of + + Page 1 / 1 - - +
    - -
    +
    - - +
    - -
    + + + +
    + + - - - -
    -
    - -
    -
    - +
    +
    +
    +
    + Original +
    +
    + +
    + +
    +
    +
    +
    + Modified +
    +
    + +
    + +
    +
    + +
    diff --git a/src/tests/compare/diff-text-runs.test.ts b/src/tests/compare/diff-text-runs.test.ts new file mode 100644 index 0000000..2e8db58 --- /dev/null +++ b/src/tests/compare/diff-text-runs.test.ts @@ -0,0 +1,313 @@ +import { describe, expect, it } from 'vitest'; + +import { comparePageModels } from '@/js/compare/engine/compare-page-models.ts'; +import { diffTextRuns } from '@/js/compare/engine/diff-text-runs.ts'; +import { + mergeIntoLines, + sortCompareTextItems, +} from '@/js/compare/engine/extract-page-model.ts'; +import type { ComparePageModel, CompareTextItem } from '@/js/compare/types.ts'; + +function makeItem(id: string, text: string): CompareTextItem { + return { + id, + text, + normalizedText: text, + rect: { x: 0, y: 0, width: 10, height: 10 }, + }; +} + +function makePage( + pageNumber: number, + textItems: CompareTextItem[] +): ComparePageModel { + return { + pageNumber, + width: 100, + height: 100, + textItems, + plainText: textItems.map((item) => item.normalizedText).join(' '), + hasText: textItems.length > 0, + source: 'pdfjs', + }; +} + +describe('diffTextRuns', () => { + it('detects modified tokens as one change', () => { + const result = diffTextRuns( + [makeItem('a', 'Hello'), makeItem('b', 'world')], + [makeItem('a', 'Hello'), makeItem('c', 'there')] + ); + + expect(result.summary).toEqual({ added: 0, removed: 0, modified: 1 }); + expect(result.changes).toHaveLength(1); + expect(result.changes[0].type).toBe('modified'); + expect(result.changes[0].beforeText).toBe('world'); + expect(result.changes[0].afterText).toBe('there'); + }); + + it('detects added tokens', () => { + const result = diffTextRuns( + [makeItem('a', 'Hello')], + [makeItem('a', 'Hello'), makeItem('b', 'again')] + ); + + expect(result.summary).toEqual({ added: 1, removed: 0, modified: 0 }); + expect(result.changes[0].type).toBe('added'); + }); + + it('splits compound replacements into discrete changes', () => { + const result = diffTextRuns( + [ + makeItem('a', 'This'), + 
makeItem('b', 'is'), + makeItem('c', 'an'), + makeItem('d', 'example'), + makeItem('e', 'of'), + makeItem('f', 'a'), + makeItem('g', 'data'), + makeItem('h', 'table'), + makeItem('i', 'new.'), + makeItem('j', 'Disabilit'), + ], + [ + makeItem('k', 'Example'), + makeItem('l', 'table'), + makeItem('m', 'This'), + makeItem('n', 'is'), + makeItem('o', 'an'), + makeItem('p', 'example'), + makeItem('q', 'of'), + makeItem('r', 'a'), + makeItem('s', 'data'), + makeItem('t', 'table.'), + makeItem('u', 'Disability'), + ] + ); + + expect(result.changes).toHaveLength(2); + expect(result.summary).toEqual({ added: 1, removed: 0, modified: 1 }); + expect( + result.changes.some( + (change) => + change.type === 'added' && change.afterText === 'Example table' + ) + ).toBe(true); + expect( + result.changes.some( + (change) => + change.type === 'modified' && + change.beforeText === 'table new. Disabilit' && + change.afterText === 'table. Disability' + ) + ).toBe(true); + }); +}); + +describe('comparePageModels', () => { + it('marks pages missing from the second document', () => { + const result = comparePageModels( + makePage(3, [makeItem('a', 'Only')]), + null + ); + + expect(result.status).toBe('left-only'); + expect(result.summary.removed).toBe(1); + expect(result.changes[0].type).toBe('page-removed'); + }); +}); + +describe('sortCompareTextItems', () => { + it('orders tokens by reading order', () => { + const items: CompareTextItem[] = [ + { + ...makeItem('b', 'Body'), + rect: { x: 60, y: 40, width: 10, height: 10 }, + }, + { + ...makeItem('a', 'Title'), + rect: { x: 10, y: 10, width: 10, height: 10 }, + }, + { + ...makeItem('c', 'Next'), + rect: { x: 10, y: 40, width: 10, height: 10 }, + }, + ]; + + expect( + sortCompareTextItems(items).map((item) => item.normalizedText) + ).toEqual(['Title', 'Next', 'Body']); + }); +}); + +describe('mergeIntoLines', () => { + it('merges items on the same Y-line into one item', () => { + const items: CompareTextItem[] = [ + { + id: '0', + text: 
'Hello', + normalizedText: 'Hello', + rect: { x: 0, y: 10, width: 50, height: 12 }, + }, + { + id: '1', + text: 'World', + normalizedText: 'World', + rect: { x: 60, y: 10, width: 50, height: 12 }, + }, + ]; + const merged = mergeIntoLines(sortCompareTextItems(items)); + + expect(merged).toHaveLength(1); + expect(merged[0].normalizedText).toBe('Hello World'); + expect(merged[0].rect.x).toBe(0); + expect(merged[0].rect.width).toBe(110); + }); + + it('does not insert spaces inside a split word', () => { + const items: CompareTextItem[] = [ + { + id: '0', + text: 'sam', + normalizedText: 'sam', + rect: { x: 0, y: 10, width: 24, height: 12 }, + }, + { + id: '1', + text: 'e', + normalizedText: 'e', + rect: { x: 24.4, y: 10, width: 8, height: 12 }, + }, + ]; + + const merged = mergeIntoLines(sortCompareTextItems(items)); + + expect(merged).toHaveLength(1); + expect(merged[0].normalizedText).toBe('same'); + }); + + it('keeps items on different Y-lines separate', () => { + const items: CompareTextItem[] = [ + { + id: '0', + text: 'Line 1', + normalizedText: 'Line 1', + rect: { x: 0, y: 10, width: 50, height: 12 }, + }, + { + id: '1', + text: 'Line 2', + normalizedText: 'Line 2', + rect: { x: 0, y: 30, width: 50, height: 12 }, + }, + ]; + const merged = mergeIntoLines(sortCompareTextItems(items)); + + expect(merged).toHaveLength(2); + expect(merged[0].normalizedText).toBe('Line 1'); + expect(merged[1].normalizedText).toBe('Line 2'); + }); + + it('produces same result for different text run boundaries', () => { + const pdf1Items: CompareTextItem[] = [ + { + id: '0', + text: 'Hello World', + normalizedText: 'Hello World', + rect: { x: 0, y: 10, width: 100, height: 12 }, + }, + ]; + const pdf2Items: CompareTextItem[] = [ + { + id: '0', + text: 'Hello', + normalizedText: 'Hello', + rect: { x: 0, y: 10, width: 45, height: 12 }, + }, + { + id: '1', + text: 'World', + normalizedText: 'World', + rect: { x: 55, y: 10, width: 45, height: 12 }, + }, + ]; + + const merged1 = 
mergeIntoLines(sortCompareTextItems(pdf1Items)); + const merged2 = mergeIntoLines(sortCompareTextItems(pdf2Items)); + + expect(merged1[0].normalizedText).toBe(merged2[0].normalizedText); + + const result = diffTextRuns(merged1, merged2); + expect(result.changes).toHaveLength(0); + }); + + it('detects actual changes after merging', () => { + const pdf1Items: CompareTextItem[] = [ + { + id: '0', + text: 'Sample', + normalizedText: 'Sample', + rect: { x: 0, y: 10, width: 60, height: 14 }, + }, + { + id: '1', + text: 'page text here', + normalizedText: 'page text here', + rect: { x: 0, y: 30, width: 120, height: 14 }, + }, + ]; + const pdf2Items: CompareTextItem[] = [ + { + id: '0', + text: 'Sample', + normalizedText: 'Sample', + rect: { x: 0, y: 10, width: 45, height: 14 }, + }, + { + id: '1', + text: 'PDF', + normalizedText: 'PDF', + rect: { x: 55, y: 10, width: 30, height: 14 }, + }, + { + id: '2', + text: 'pages text here', + normalizedText: 'pages text here', + rect: { x: 0, y: 30, width: 125, height: 14 }, + }, + ]; + + const merged1 = mergeIntoLines(sortCompareTextItems(pdf1Items)); + const merged2 = mergeIntoLines(sortCompareTextItems(pdf2Items)); + + expect(merged1).toHaveLength(2); + expect(merged2).toHaveLength(2); + + const result = diffTextRuns(merged1, merged2); + expect(result.summary.modified).toBe(1); + expect(result.summary.added).toBe(0); + expect(result.summary.removed).toBe(0); + expect(result.changes).toHaveLength(1); + expect(result.changes[0].beforeText).toBe('page'); + expect(result.changes[0].afterText).toBe('PDF pages'); + }); + + it('preserves original casing in change descriptions', () => { + const result = diffTextRuns( + [makeItem('a', 'Sample')], + [makeItem('b', 'Sample PDF')] + ); + + expect(result.changes[0].afterText).toBe('PDF'); + }); + + it('ignores joined versus split words when collapsed text matches', () => { + const result = diffTextRuns( + [makeItem('a', 'non'), makeItem('b', 'tincidunt')], + [makeItem('c', 'nontincidunt')] + 
); + + expect(result.changes).toHaveLength(0); + expect(result.summary).toEqual({ added: 0, removed: 0, modified: 0 }); + }); +}); diff --git a/src/tests/compare/pair-pages.test.ts b/src/tests/compare/pair-pages.test.ts new file mode 100644 index 0000000..10b83be --- /dev/null +++ b/src/tests/compare/pair-pages.test.ts @@ -0,0 +1,42 @@ +import { describe, expect, it } from 'vitest'; + +import { pairPages } from '@/js/compare/engine/pair-pages.ts'; +import type { ComparePageSignature } from '@/js/compare/types.ts'; + +function signature(pageNumber: number, text: string): ComparePageSignature { + return { + pageNumber, + plainText: text, + hasText: text.length > 0, + tokenItems: text + .split(/\s+/) + .filter(Boolean) + .map((token, index) => ({ + id: `${pageNumber}-${index}`, + text: token, + normalizedText: token, + rect: { x: 0, y: 0, width: 0, height: 0 }, + })), + }; +} + +describe('pairPages', () => { + it('pairs reordered and inserted pages without collapsing alignment', () => { + const pairs = pairPages( + [signature(1, 'alpha beta'), signature(2, 'gamma delta')], + [ + signature(1, 'intro page'), + signature(2, 'alpha beta'), + signature(3, 'gamma delta'), + ] + ); + + expect(pairs).toHaveLength(3); + expect(pairs[0]).toMatchObject({ + leftPageNumber: null, + rightPageNumber: 1, + }); + expect(pairs[1]).toMatchObject({ leftPageNumber: 1, rightPageNumber: 2 }); + expect(pairs[2]).toMatchObject({ leftPageNumber: 2, rightPageNumber: 3 }); + }); +}); diff --git a/src/tests/compare/text-normalization.test.ts b/src/tests/compare/text-normalization.test.ts new file mode 100644 index 0000000..6d06e2d --- /dev/null +++ b/src/tests/compare/text-normalization.test.ts @@ -0,0 +1,29 @@ +import { describe, expect, it } from 'vitest'; + +import { + isLowQualityExtractedText, + joinNormalizedText, + normalizeCompareText, +} from '@/js/compare/engine/text-normalization.ts'; + +describe('text normalization', () => { + it('joins punctuation without inserting stray spaces', () 
=> { + expect(joinNormalizedText(['Example', 'table', ':', 'v2'])).toBe( + 'Example table: v2' + ); + expect(joinNormalizedText(['"', 'Quoted', 'text', '"'])).toBe( + '"Quoted text"' + ); + }); + + it('normalizes private-use and control characters away', () => { + expect(normalizeCompareText('A\u0000B\uE000C')).toBe('A B C'); + }); + + it('flags punctuation-heavy extraction as low quality', () => { + expect(isLowQualityExtractedText('! " # $ % & \'')).toBe(true); + expect(isLowQualityExtractedText('Example table 2026 revision')).toBe( + false + ); + }); +});