refactor: update PDF comparison types and enhance UI for better usability

- Refactored CompareState to import from a centralized type definition.
- Enhanced the compare-pdfs.html layout with improved styles for overlay and side-by-side modes.
- Added new CSS styles for various UI components including panels, buttons, and highlights.
- Implemented a new sidebar for displaying change summaries and filters.
- Created unit tests for text comparison logic, including diffing text runs and page pairing.
- Added tests for text normalization functions to ensure proper handling of punctuation and character normalization.
This commit is contained in:
alam00000
2026-03-08 23:55:04 +05:30
parent 86cbaf6cd3
commit 1d68691331
20 changed files with 3447 additions and 332 deletions

View File

@@ -0,0 +1,78 @@
import type { ComparePageModel, ComparePageResult } from '../types.ts';
import { diffTextRuns } from './diff-text-runs.ts';
/**
 * Builds the comparison result for one aligned pair of pages.
 *
 * A page that exists on only one side produces a single `page-removed` or
 * `page-added` change carrying the first 200 characters of that page's text.
 * When both pages exist their text items are diffed word by word.
 *
 * @param leftPage - Page model from the first PDF, or `null` when unpaired.
 * @param rightPage - Page model from the second PDF, or `null` when unpaired.
 * @returns The page-level result; `visualDiff` is always `null` here.
 */
export function comparePageModels(
  leftPage: ComparePageModel | null,
  rightPage: ComparePageModel | null
): ComparePageResult {
  if (!leftPage) {
    if (!rightPage) {
      // Nothing on either side: report an empty match.
      return {
        status: 'match',
        leftPageNumber: null,
        rightPageNumber: null,
        changes: [],
        summary: { added: 0, removed: 0, modified: 0 },
        visualDiff: null,
        usedOcr: false,
      };
    }

    const addedPageNumber = rightPage.pageNumber;
    return {
      status: 'right-only',
      leftPageNumber: null,
      rightPageNumber: addedPageNumber,
      changes: [
        {
          id: 'page-added',
          type: 'page-added',
          description: `Page ${addedPageNumber} exists only in the second PDF.`,
          beforeText: '',
          afterText: rightPage.plainText.slice(0, 200),
          beforeRects: [],
          afterRects: [],
        },
      ],
      summary: { added: 1, removed: 0, modified: 0 },
      visualDiff: null,
      usedOcr: rightPage.source === 'ocr',
    };
  }

  if (!rightPage) {
    const removedPageNumber = leftPage.pageNumber;
    return {
      status: 'left-only',
      leftPageNumber: removedPageNumber,
      rightPageNumber: null,
      changes: [
        {
          id: 'page-removed',
          type: 'page-removed',
          description: `Page ${removedPageNumber} exists only in the first PDF.`,
          beforeText: leftPage.plainText.slice(0, 200),
          afterText: '',
          beforeRects: [],
          afterRects: [],
        },
      ],
      summary: { added: 0, removed: 1, modified: 0 },
      visualDiff: null,
      usedOcr: leftPage.source === 'ocr',
    };
  }

  // Both pages are present: the text diff decides the status.
  const textDiff = diffTextRuns(leftPage.textItems, rightPage.textItems);
  const eitherSideUsedOcr =
    leftPage.source === 'ocr' || rightPage.source === 'ocr';
  return {
    status: textDiff.changes.length > 0 ? 'changed' : 'match',
    leftPageNumber: leftPage.pageNumber,
    rightPageNumber: rightPage.pageNumber,
    changes: textDiff.changes,
    summary: textDiff.summary,
    visualDiff: null,
    usedOcr: eitherSideUsedOcr,
  };
}

View File

@@ -0,0 +1,237 @@
import { diffArrays } from 'diff';
import type {
CharPosition,
CompareChangeSummary,
CompareRectangle,
CompareTextChange,
CompareTextItem,
CompareWordToken,
} from '../types.ts';
/**
 * Word-level unit used by the diff in this module.
 */
interface WordToken {
// Text used when building change descriptions and before/after text.
word: string;
// Key handed to the array diff and used for word-break-insensitive equality.
compareWord: string;
// Rectangle of the word; grouped into highlight rectangles for a change.
rect: CompareRectangle;
}
/**
 * Returns per-character horizontal positions for a line.
 *
 * The line's own `charMap` is reused when it has exactly one entry per
 * character of `normalizedText`; otherwise the line width is divided evenly
 * between the characters.
 */
function getCharMap(line: CompareTextItem): CharPosition[] {
  const characterCount = line.normalizedText.length;
  const provided = line.charMap;
  if (provided && provided.length === characterCount) {
    return provided;
  }

  // Even split; the divisor is kept at >= 1 so empty text cannot divide by 0.
  const step = line.rect.width / Math.max(characterCount, 1);
  const positions: CharPosition[] = [];
  for (let i = 0; i < characterCount; i += 1) {
    positions.push({ x: line.rect.x + i * step, width: step });
  }
  return positions;
}
/**
 * Converts a line into word tokens with rectangles.
 *
 * Pre-computed `wordTokens` on the line win. Otherwise the normalized text is
 * split on whitespace and each word's rectangle is derived from the line's
 * character map, falling back to an even per-character split of the line
 * width when a character position is missing.
 */
function splitLineIntoWords(line: CompareTextItem): WordToken[] {
  const precomputed = line.wordTokens;
  if (precomputed && precomputed.length > 0) {
    return precomputed.map(
      ({ word, compareWord, rect }: CompareWordToken) => ({
        word,
        compareWord,
        rect,
      })
    );
  }

  const text = line.normalizedText;
  const words = text.split(/\s+/).filter(Boolean);
  if (words.length === 0) return [];

  const charMap = getCharMap(line);
  const tokens: WordToken[] = [];
  // Search position; advancing it keeps repeated words mapped left to right.
  let cursor = 0;

  for (const word of words) {
    const firstIndex = text.indexOf(word, cursor);
    const lastIndex = firstIndex + word.length - 1;
    cursor = firstIndex + word.length;

    const firstChar = charMap[firstIndex];
    const lastChar = charMap[lastIndex];

    let rect: CompareRectangle;
    if (firstChar && lastChar) {
      rect = {
        x: firstChar.x,
        y: line.rect.y,
        width: lastChar.x + lastChar.width - firstChar.x,
        height: line.rect.height,
      };
    } else {
      const charWidth = line.rect.width / Math.max(text.length, 1);
      rect = {
        x: line.rect.x + firstIndex * charWidth,
        y: line.rect.y,
        width: word.length * charWidth,
        height: line.rect.height,
      };
    }

    tokens.push({ word, compareWord: word.toLowerCase(), rect });
  }

  return tokens;
}
/**
 * Merges rectangles that sit next to each other on the same line into
 * bounding boxes.
 *
 * Rectangles are visited in (y, x) order. A rectangle joins the current
 * cluster when it is vertically within 60% of the previous rectangle's height
 * (at least 4) and starts no further right than two heights past the previous
 * rectangle's right edge.
 */
function groupAdjacentRects(rects: CompareRectangle[]): CompareRectangle[] {
  if (rects.length === 0) return [];

  const ordered = rects.slice().sort((a, b) => a.y - b.y || a.x - b.x);
  const clusters: CompareRectangle[][] = [];

  for (const rect of ordered) {
    const openCluster = clusters[clusters.length - 1];
    if (openCluster !== undefined) {
      const tail = openCluster[openCluster.length - 1];
      const lineTolerance = Math.max(tail.height * 0.6, 4);
      const onSameLine = Math.abs(rect.y - tail.y) < lineTolerance;
      const reach = tail.x + tail.width + tail.height * 2;
      if (onSameLine && rect.x <= reach) {
        openCluster.push(rect);
        continue;
      }
    }
    clusters.push([rect]);
  }

  return clusters.map((members) => {
    let left = Infinity;
    let top = Infinity;
    let right = -Infinity;
    let bottom = -Infinity;
    for (const member of members) {
      left = Math.min(left, member.x);
      top = Math.min(top, member.y);
      right = Math.max(right, member.x + member.width);
      bottom = Math.max(bottom, member.y + member.height);
    }
    return { x: left, y: top, width: right - left, height: bottom - top };
  });
}
/** Concatenates the comparison keys of `words` with no separator. */
function collapseWords(words: WordToken[]) {
  const keys = Array.from(words, ({ compareWord }) => compareWord);
  return keys.join('');
}
/**
 * Tells whether two non-empty word runs spell the same text once word
 * boundaries are ignored. An empty run on either side is never equivalent.
 */
function areEquivalentIgnoringWordBreaks(
  beforeWords: WordToken[],
  afterWords: WordToken[]
) {
  const bothPresent = beforeWords.length !== 0 && afterWords.length !== 0;
  if (!bothPresent) {
    return false;
  }
  const beforeKey = collapseWords(beforeWords);
  const afterKey = collapseWords(afterWords);
  return beforeKey === afterKey;
}
/**
 * Appends one text change to `changes`, unless both sides are empty.
 *
 * The change id is `<type>-<current number of changes>`. `modified` keeps both
 * sides, `removed` keeps only the "before" side, and every other type is
 * recorded as an addition that keeps only the "after" side.
 *
 * @param changes - Accumulator that receives the new change (mutated).
 * @param type - Kind of change to record.
 * @param beforeWords - Words taken from the first document.
 * @param afterWords - Words taken from the second document.
 */
function createWordChange(
  changes: CompareTextChange[],
  type: CompareTextChange['type'],
  beforeWords: WordToken[],
  afterWords: WordToken[]
) {
  const toText = (tokens: WordToken[]) =>
    tokens.map((token) => token.word).join(' ');
  const toRects = (tokens: WordToken[]) =>
    groupAdjacentRects(tokens.map((token) => token.rect));

  const beforeText = toText(beforeWords);
  const afterText = toText(afterWords);
  if (beforeText === '' && afterText === '') return;

  const id = `${type}-${changes.length}`;
  const beforeRects = toRects(beforeWords);
  const afterRects = toRects(afterWords);

  switch (type) {
    case 'modified':
      changes.push({
        id,
        type,
        description: `Replaced "${beforeText}" with "${afterText}"`,
        beforeText,
        afterText,
        beforeRects,
        afterRects,
      });
      break;
    case 'removed':
      changes.push({
        id,
        type,
        description: `Removed "${beforeText}"`,
        beforeText,
        afterText: '',
        beforeRects,
        afterRects: [],
      });
      break;
    default:
      changes.push({
        id,
        type,
        description: `Added "${afterText}"`,
        beforeText: '',
        afterText,
        beforeRects: [],
        afterRects,
      });
      break;
  }
}
/**
 * Counts how many changes are of type `added`, `removed` and `modified`.
 * Changes of any other type are not counted.
 */
function toSummary(changes: CompareTextChange[]): CompareChangeSummary {
  const summary: CompareChangeSummary = { added: 0, removed: 0, modified: 0 };
  for (const { type } of changes) {
    switch (type) {
      case 'added':
        summary.added += 1;
        break;
      case 'removed':
        summary.removed += 1;
        break;
      case 'modified':
        summary.modified += 1;
        break;
      default:
        break;
    }
  }
  return summary;
}
/**
 * Diffs two lists of text items word by word.
 *
 * Both sides are flattened to word tokens and compared on `compareWord`.
 * A removal immediately followed by an addition is reported as one
 * `modified` change, unless the two runs only differ in where the words
 * break, in which case nothing is reported.
 *
 * @param beforeItems - Text items of the first page.
 * @param afterItems - Text items of the second page.
 * @returns The detected changes and their per-type counts.
 */
export function diffTextRuns(
  beforeItems: CompareTextItem[],
  afterItems: CompareTextItem[]
) {
  const beforeWords = beforeItems.flatMap(splitLineIntoWords);
  const afterWords = afterItems.flatMap(splitLineIntoWords);

  const hunks = diffArrays(
    beforeWords.map((token) => token.compareWord),
    afterWords.map((token) => token.compareWord)
  );

  const changes: CompareTextChange[] = [];
  // Read positions into the two word lists; they advance as hunks are consumed.
  let beforeCursor = 0;
  let afterCursor = 0;
  let hunkCursor = 0;

  while (hunkCursor < hunks.length) {
    const hunk = hunks[hunkCursor];
    const size = hunk.value.length;
    hunkCursor += 1;

    if (hunk.removed) {
      const dropped = beforeWords.slice(beforeCursor, beforeCursor + size);
      beforeCursor += size;

      const follower = hunks[hunkCursor];
      if (!follower?.added) {
        createWordChange(changes, 'removed', dropped, []);
        continue;
      }

      // Removal + addition pair: consume the addition as part of this step.
      const followerSize = follower.value.length;
      const inserted = afterWords.slice(afterCursor, afterCursor + followerSize);
      afterCursor += followerSize;
      hunkCursor += 1;

      if (!areEquivalentIgnoringWordBreaks(dropped, inserted)) {
        createWordChange(changes, 'modified', dropped, inserted);
      }
      continue;
    }

    if (hunk.added) {
      const inserted = afterWords.slice(afterCursor, afterCursor + size);
      afterCursor += size;
      createWordChange(changes, 'added', [], inserted);
      continue;
    }

    // Unchanged run: skip it on both sides.
    beforeCursor += size;
    afterCursor += size;
  }

  return { changes, summary: toSummary(changes) };
}

View File

@@ -0,0 +1,520 @@
import * as pdfjsLib from 'pdfjs-dist';
import type {
ComparePageModel,
CompareTextItem,
CharPosition,
CompareWordToken,
} from '../types.ts';
import {
joinCompareTextItems,
normalizeCompareText,
} from './text-normalization.ts';
/**
 * Shape of the text-content entries this module consumes; entries are
 * selected from `textContent.items` by the presence of a `str` property.
 */
type PageTextItem = {
// Raw text of the run.
str: string;
// Run extent; added to transform[4] / transform[5] to locate the run's corners.
width: number;
height: number;
// Indices 0, 1, 4 and 5 are read here (font scale and origin).
transform: number[];
dir: string;
// Key used to look up the run's entry in TextStyles.
fontName: string;
hasEOL: boolean;
};
/** Per-font style lookup keyed by `fontName`; only `fontFamily` is read here. */
type TextStyles = Record<string, { fontFamily?: string }>;
// Canvas used only for text measurement; not created when no `document` global exists.
const measurementCanvas =
typeof document !== 'undefined' ? document.createElement('canvas') : null;
// 2D context of the measurement canvas, or null when there is no canvas.
const measurementContext = measurementCanvas
? measurementCanvas.getContext('2d')
: null;
// Measured widths keyed by `${fontSpec}|${text}`; only allocated when a context exists.
const textMeasurementCache: Map<string, number> | null = measurementContext
? new Map()
: null;
// Font most recently assigned to the measurement context by measureTextWidth.
let lastMeasurementFont = '';
// Fallback widths used when no measurement is available or a measurement is unusable.
const DEFAULT_CHAR_WIDTH = 1;
const DEFAULT_SPACE_WIDTH = 0.33;
/**
 * Decides whether `current` should be glued to `previous` without a space:
 * when `current` starts with closing punctuation or a quote, or when
 * `previous` ends with an opening bracket, slash, quote or hyphen.
 * An empty `previous` never joins.
 */
function shouldJoinTokenWithPrevious(previous: string, current: string) {
  const startsWithClosingPunctuation = /^[,.;:!?%)\]}]/;
  const startsWithQuote = /^[''"']/u;
  const endsWithOpener = /[([{/"'-]$/u;

  return (
    Boolean(previous) &&
    (startsWithClosingPunctuation.test(current) ||
      startsWithQuote.test(current) ||
      endsWithOpener.test(previous))
  );
}
/**
 * Upper bound on cached measurements. Callers measure every prefix of every
 * text run, so without a cap the module-level cache grows with the amount of
 * text processed and is never released.
 */
const MAX_TEXT_MEASUREMENT_CACHE_ENTRIES = 20000;

/**
 * Measures the rendered width of `text` in the given canvas font.
 *
 * Without a measurement context the width is estimated: 0 for empty text,
 * `DEFAULT_SPACE_WIDTH` for a single space, otherwise one
 * `DEFAULT_CHAR_WIDTH` per UTF-16 code unit. With a context, results are
 * memoized per `(fontSpec, text)`; the memo is emptied once it reaches
 * `MAX_TEXT_MEASUREMENT_CACHE_ENTRIES` so memory use stays bounded.
 *
 * @param fontSpec - Canvas font shorthand, e.g. `12px sans-serif`.
 * @param text - Text to measure.
 * @returns The measured or estimated width (0 when the canvas reports none).
 */
function measureTextWidth(fontSpec: string, text: string): number {
  if (!measurementContext) {
    if (!text) return 0;
    if (text === ' ') return DEFAULT_SPACE_WIDTH;
    return text.length * DEFAULT_CHAR_WIDTH;
  }

  const key = `${fontSpec}|${text}`;
  const cached = textMeasurementCache?.get(key);
  if (cached !== undefined) {
    return cached;
  }

  // Assigning `font` is comparatively costly, so only do it on a cache miss
  // and only when the font actually changes.
  if (lastMeasurementFont !== fontSpec) {
    measurementContext.font = fontSpec;
    lastMeasurementFont = fontSpec;
  }

  const width = measurementContext.measureText(text).width || 0;

  if (textMeasurementCache) {
    // Dropping everything is crude but keeps the bound strict; entries are
    // cheap to recompute.
    if (textMeasurementCache.size >= MAX_TEXT_MEASUREMENT_CACHE_ENTRIES) {
      textMeasurementCache.clear();
    }
    textMeasurementCache.set(key, width);
  }
  return width;
}
/**
 * Splits one text run into word tokens, each with its own rectangle.
 *
 * The run is cut on whitespace. Each word's horizontal extent is estimated
 * from measured character advances (via measureTextWidth), expressed as a
 * fraction of the whole run, and projected onto the run's corners after they
 * have been converted with `viewport.convertToViewportPoint`.
 *
 * @param viewport - Viewport used to convert the run's corner points.
 * @param item - The text run to split.
 * @param fallbackRect - Rectangle of the whole run; supplies x/y when a
 *   computed coordinate is not finite, and the minimum token height.
 * @param styles - Font style lookup keyed by `item.fontName`.
 * @returns One token per whitespace-separated word that is non-empty after
 *   normalization; an empty array when the run is blank.
 */
function buildItemWordTokens(
viewport: pdfjsLib.PageViewport,
item: PageTextItem,
fallbackRect: CompareTextItem['rect'],
styles: TextStyles
): CompareWordToken[] {
const rawText = item.str || '';
if (!rawText.trim()) {
return [];
}
const totalLen = Math.max(rawText.length, 1);
const textStyle = item.fontName ? styles[item.fontName] : undefined;
const fontFamily = textStyle?.fontFamily ?? 'sans-serif';
// Font size is taken from the length of the transform's first column, floored at 0.5.
const fontScale = Math.max(
0.5,
Math.hypot(item.transform[0], item.transform[1]) || 0
);
const fontSpec = `${fontScale}px ${fontFamily}`;
// weights[i] is the advance contributed by character i: the difference
// between the measured widths of two successive prefixes of the run.
const weights: number[] = new Array(totalLen);
let runningText = '';
let previousAdvance = 0;
for (let index = 0; index < totalLen; index += 1) {
runningText += rawText[index];
const advance = measureTextWidth(fontSpec, runningText);
let width = advance - previousAdvance;
// Unusable increments fall back to the default space/character width.
if (!Number.isFinite(width) || width <= 0) {
width = rawText[index] === ' ' ? DEFAULT_SPACE_WIDTH : DEFAULT_CHAR_WIDTH;
}
weights[index] = width;
previousAdvance = advance;
}
// If the whole run measured as zero or non-finite, discard all measurements
// and use the default widths throughout.
if (!Number.isFinite(previousAdvance) || previousAdvance <= 0) {
for (let index = 0; index < totalLen; index += 1) {
weights[index] =
rawText[index] === ' ' ? DEFAULT_SPACE_WIDTH : DEFAULT_CHAR_WIDTH;
}
}
// prefix[i] is the summed weight of characters [0, i); used to turn
// character indices into fractions of the run.
const prefix: number[] = new Array(totalLen + 1);
prefix[0] = 0;
for (let index = 0; index < totalLen; index += 1) {
prefix[index + 1] = prefix[index] + weights[index];
}
const totalWeight = prefix[totalLen] || 1;
const rawX = item.transform[4];
const rawY = item.transform[5];
// Corners of the run, in order: origin, origin + width, origin + height,
// origin + width + height, each converted through the viewport.
const transformed = [
viewport.convertToViewportPoint(rawX, rawY),
viewport.convertToViewportPoint(rawX + item.width, rawY),
viewport.convertToViewportPoint(rawX, rawY + item.height),
viewport.convertToViewportPoint(rawX + item.width, rawY + item.height),
];
const xs = transformed.map(([x]) => x);
const ys = transformed.map(([, y]) => y);
const left = Math.min(...xs);
const right = Math.max(...xs);
const top = Math.min(...ys);
const bottom = Math.max(...ys);
// Direction along the run (origin -> origin + width) and across it
// (origin -> origin + height) in converted coordinates.
const [baselineStart, baselineEnd, verticalEnd] = transformed;
const baselineVector: [number, number] = [
baselineEnd[0] - baselineStart[0],
baselineEnd[1] - baselineStart[1],
];
const verticalVector: [number, number] = [
verticalEnd[0] - baselineStart[0],
verticalEnd[1] - baselineStart[1],
];
// Both vectors must have non-negligible length for the oriented path below.
const hasOrientationVectors =
Math.hypot(baselineVector[0], baselineVector[1]) > 1e-6 &&
Math.hypot(verticalVector[0], verticalVector[1]) > 1e-6;
const tokens: CompareWordToken[] = [];
const wordRegex = /\S+/gu;
let match: RegExpExecArray | null;
// End index (exclusive) of the previously visited word in rawText.
let previousEnd = 0;
while ((match = wordRegex.exec(rawText)) !== null) {
const tokenText = match[0];
const normalizedWord = normalizeCompareText(tokenText);
// Words that normalize to nothing are skipped but still advance previousEnd.
if (!normalizedWord) {
previousEnd = match.index + tokenText.length;
continue;
}
const startIndex = match.index;
const endIndex = startIndex + tokenText.length;
// Start and end of the word as fractions of the run's total weight.
const relStart = prefix[startIndex] / totalWeight;
const relEnd = prefix[endIndex] / totalWeight;
let wordLeft: number;
let wordRight: number;
let wordTop: number;
let wordBottom: number;
if (hasOrientationVectors) {
// Build the word's quadrilateral along the run direction, then take
// its axis-aligned bounding box.
const segStart: [number, number] = [
baselineStart[0] + baselineVector[0] * relStart,
baselineStart[1] + baselineVector[1] * relStart,
];
const segEnd: [number, number] = [
baselineStart[0] + baselineVector[0] * relEnd,
baselineStart[1] + baselineVector[1] * relEnd,
];
const cornerPoints: Array<[number, number]> = [
segStart,
[segStart[0] + verticalVector[0], segStart[1] + verticalVector[1]],
[segEnd[0] + verticalVector[0], segEnd[1] + verticalVector[1]],
segEnd,
];
wordLeft = Math.min(...cornerPoints.map(([x]) => x));
wordRight = Math.max(...cornerPoints.map(([x]) => x));
wordTop = Math.min(...cornerPoints.map(([, y]) => y));
wordBottom = Math.max(...cornerPoints.map(([, y]) => y));
} else {
// Degenerate geometry: interpolate horizontally across the run's
// bounding box and use its full vertical extent.
const segLeft = left + (right - left) * relStart;
const segRight = left + (right - left) * relEnd;
wordLeft = Math.min(segLeft, segRight);
wordRight = Math.max(segLeft, segRight);
wordTop = top;
wordBottom = bottom;
}
// Tokens are at least 1 wide and at least as tall as the fallback rect.
const width = Math.max(wordRight - wordLeft, 1);
const height = Math.max(wordBottom - wordTop, fallbackRect.height);
const gapText = rawText.slice(previousEnd, startIndex);
const previousToken = tokens[tokens.length - 1];
tokens.push({
word: normalizedWord,
compareWord: normalizedWord.toLowerCase(),
rect: {
x: Number.isFinite(wordLeft) ? wordLeft : fallbackRect.x,
y: Number.isFinite(wordTop) ? wordTop : fallbackRect.y,
width,
height,
},
// Joined when the text since the previous word is non-empty and has no
// whitespace, or when the punctuation rules say the two belong together.
joinsWithPrevious:
(gapText.length > 0 && !/\s/u.test(gapText)) ||
(previousToken
? shouldJoinTokenWithPrevious(previousToken.word, normalizedWord)
: false),
});
previousEnd = endIndex;
}
return tokens;
}
/**
 * Converts one text run into a positioned compare item.
 *
 * The run's transform is combined with the viewport transform to get its
 * position; width and height are at least 1, and `y` is the transformed
 * vertical origin minus the height.
 *
 * @param viewport - Viewport supplying the transform and scale.
 * @param item - The text run to convert.
 * @param index - Position of the run, used as the id prefix.
 * @param styles - Font style lookup forwarded to the word tokenizer.
 */
function toRect(
  viewport: pdfjsLib.PageViewport,
  item: PageTextItem,
  index: number,
  styles: TextStyles
) {
  const matrix = pdfjsLib.Util.transform(viewport.transform, item.transform);

  // Prefer the vertical scale of the combined matrix; fall back to the
  // run's own height when that scale is 0.
  const rawHeight = Math.abs(matrix[3]) || item.height * viewport.scale;
  const height = Math.max(rawHeight, 1);
  const rect = {
    x: matrix[4],
    y: matrix[5] - height,
    width: Math.max(item.width * viewport.scale, 1),
    height,
  };

  const normalizedText = normalizeCompareText(item.str);
  const wordTokens = buildItemWordTokens(viewport, item, rect, styles);

  return {
    id: `${index}-${normalizedText}`,
    text: item.str,
    normalizedText,
    rect,
    wordTokens,
  } satisfies CompareTextItem;
}
/**
 * Returns a copy of `items` in reading order: top to bottom, then left to
 * right, with ids as the final tie-breaker.
 *
 * Two items count as being on the same line when their tops differ by no more
 * than 60% of the smaller height (at least 4); horizontal differences of 1 or
 * less are treated as equal.
 */
export function sortCompareTextItems(items: CompareTextItem[]) {
  const byReadingOrder = (
    left: CompareTextItem,
    right: CompareTextItem
  ): number => {
    const verticalDelta = left.rect.y - right.rect.y;
    const smallerHeight = Math.min(left.rect.height, right.rect.height);
    const lineTolerance = Math.max(smallerHeight * 0.6, 4);
    if (Math.abs(verticalDelta) > lineTolerance) {
      return verticalDelta;
    }

    const horizontalDelta = left.rect.x - right.rect.x;
    if (Math.abs(horizontalDelta) > 1) {
      return horizontalDelta;
    }

    return left.id.localeCompare(right.id);
  };

  return items.slice().sort(byReadingOrder);
}
/**
 * Average width of one non-whitespace character of `item`: the item width
 * divided by its character count with whitespace removed (minimum 1).
 */
function averageCharacterWidth(item: CompareTextItem) {
  const visibleLength = item.normalizedText.replace(/\s+/g, '').length;
  const divisor = Math.max(visibleLength, 1);
  return item.rect.width / divisor;
}
/**
 * Decides whether a space belongs between two horizontally adjacent items.
 *
 * No space is inserted when either text is empty, when the right text starts
 * with closing punctuation or a quote, when the left text ends with an opener
 * (bracket, slash, quote, hyphen), or when the items touch or overlap.
 * Otherwise the gap must reach 45% of the smaller average character width
 * (at least 1.5).
 */
function shouldInsertSpaceBetweenItems(
  left: CompareTextItem,
  right: CompareTextItem
) {
  const leftText = left.normalizedText;
  const rightText = right.normalizedText;

  const textForbidsSpace =
    !leftText ||
    !rightText ||
    /^[,.;:!?%)\]}]/.test(rightText) ||
    /^[''"']/u.test(rightText) ||
    /[([{/"'-]$/u.test(leftText);
  if (textForbidsSpace) {
    return false;
  }

  const leftEdge = left.rect.x + left.rect.width;
  const gap = right.rect.x - leftEdge;
  if (gap <= 0) {
    return false;
  }

  const narrowerCharacter = Math.min(
    averageCharacterWidth(left),
    averageCharacterWidth(right)
  );
  return gap >= Math.max(narrowerCharacter * 0.45, 1.5);
}
/**
 * Joins the fragments of one line into a single normalized string and builds
 * a matching list of character positions.
 *
 * Each fragment's width is split evenly between its characters. Where a space
 * is inserted between two fragments, one extra position covering the gap
 * (at least 1 wide) is added.
 *
 * @returns The normalized text and the character positions collected while
 *   joining (before the final normalization of the joined text).
 */
function mergeLineText(lineItems: CompareTextItem[]): {
  text: string;
  charMap: CharPosition[];
} {
  if (lineItems.length === 0) {
    return { text: '', charMap: [] };
  }

  const charMap: CharPosition[] = [];
  const pieces: string[] = [];

  lineItems.forEach((fragment, position) => {
    if (position > 0) {
      const before = lineItems[position - 1];
      if (shouldInsertSpaceBetweenItems(before, fragment)) {
        const gap = fragment.rect.x - (before.rect.x + before.rect.width);
        charMap.push({
          x: before.rect.x + before.rect.width,
          width: Math.max(gap, 1),
        });
        pieces.push(' ');
      }
    }

    const fragmentText = fragment.normalizedText;
    const perCharacter = fragment.rect.width / Math.max(fragmentText.length, 1);
    for (let offset = 0; offset < fragmentText.length; offset += 1) {
      charMap.push({
        x: fragment.rect.x + offset * perCharacter,
        width: perCharacter,
      });
    }
    pieces.push(fragmentText);
  });

  return { text: normalizeCompareText(pieces.join('')), charMap };
}
/**
 * Fuses two word tokens into one: texts are concatenated (left then right)
 * and the rectangle becomes the bounding box of both.
 */
function mergeWordTokenRects(
  left: CompareWordToken,
  right: CompareWordToken
): CompareWordToken {
  const a = left.rect;
  const b = right.rect;

  const x = Math.min(a.x, b.x);
  const y = Math.min(a.y, b.y);
  const farX = Math.max(a.x + a.width, b.x + b.width);
  const farY = Math.max(a.y + a.height, b.y + b.height);

  return {
    word: `${left.word}${right.word}`,
    compareWord: `${left.compareWord}${right.compareWord}`,
    rect: { x, y, width: farX - x, height: farY - y },
  };
}
/**
 * Builds the word tokens of a merged line from its fragments.
 *
 * Returns `undefined` when no fragment carries word tokens. A fragment
 * without tokens contributes one token for its whole text. Within a fragment
 * a token is fused into the previous one when it is flagged
 * `joinsWithPrevious`; the first token of a fragment is fused when no space
 * belongs between that fragment and the one before it.
 */
function buildMergedWordTokens(lineItems: CompareTextItem[]) {
  const anyFragmentHasTokens = lineItems.some(
    (fragment) => fragment.wordTokens && fragment.wordTokens.length > 0
  );
  if (!anyFragmentHasTokens) {
    return undefined;
  }

  const merged: CompareWordToken[] = [];
  let previousFragment: CompareTextItem | null = null;

  for (const fragment of lineItems) {
    const ownTokens = fragment.wordTokens;
    const fragmentTokens: CompareWordToken[] =
      ownTokens && ownTokens.length > 0
        ? ownTokens
        : [
            {
              word: fragment.normalizedText,
              compareWord: fragment.normalizedText.toLowerCase(),
              rect: fragment.rect,
            } satisfies CompareWordToken,
          ];

    for (let position = 0; position < fragmentTokens.length; position += 1) {
      const token = fragmentTokens[position];

      let attachToPrevious: boolean;
      if (position > 0) {
        attachToPrevious = Boolean(token.joinsWithPrevious);
      } else if (previousFragment) {
        attachToPrevious = !shouldInsertSpaceBetweenItems(
          previousFragment,
          fragment
        );
      } else {
        attachToPrevious = false;
      }

      const lastIndex = merged.length - 1;
      if (lastIndex >= 0 && attachToPrevious) {
        merged[lastIndex] = mergeWordTokenRects(merged[lastIndex], token);
      } else {
        merged.push({
          word: token.word,
          compareWord: token.compareWord,
          rect: token.rect,
        });
      }
    }

    previousFragment = fragment;
  }

  return merged;
}
/**
 * Groups already-sorted items into lines and merges each line into one item.
 *
 * An item joins the current line when its top is within 60% of the smaller
 * height (at least 4) of the line's first item; otherwise it starts a new
 * line. Each merged line gets the id `line-<index>`, the bounding box of its
 * fragments, the raw texts joined by spaces, and the merged normalized text,
 * character map and word tokens.
 *
 * @param sortedItems - Items in reading order.
 * @returns One merged item per detected line, in input order.
 */
export function mergeIntoLines(
  sortedItems: CompareTextItem[]
): CompareTextItem[] {
  const lines: CompareTextItem[][] = [];

  for (const item of sortedItems) {
    const openLine = lines[lines.length - 1];
    if (openLine !== undefined) {
      // The first item of the line is the reference for the whole line.
      const anchor = openLine[0];
      const tolerance = Math.max(
        Math.min(anchor.rect.height, item.rect.height) * 0.6,
        4
      );
      if (Math.abs(item.rect.y - anchor.rect.y) <= tolerance) {
        openLine.push(item);
        continue;
      }
    }
    lines.push([item]);
  }

  return lines.map((fragments, lineIndex) => {
    const merged = mergeLineText(fragments);

    let left = Infinity;
    let top = Infinity;
    let right = -Infinity;
    let bottom = -Infinity;
    for (const { rect } of fragments) {
      left = Math.min(left, rect.x);
      top = Math.min(top, rect.y);
      right = Math.max(right, rect.x + rect.width);
      bottom = Math.max(bottom, rect.y + rect.height);
    }

    return {
      id: `line-${lineIndex}`,
      text: fragments.map((fragment) => fragment.text).join(' '),
      normalizedText: merged.text,
      rect: { x: left, y: top, width: right - left, height: bottom - top },
      fragments,
      charMap: merged.charMap,
      wordTokens: buildMergedWordTokens(fragments),
    };
  });
}
/**
 * Extracts the text model of one PDF page.
 *
 * Text runs are read from the page, positioned against `viewport`, stripped
 * of runs that normalize to nothing, sorted into reading order and merged
 * into lines.
 *
 * @param page - The page to read text from.
 * @param viewport - Viewport that defines the coordinate space and size.
 * @returns The page model, with `source` set to `'pdfjs'`.
 */
export async function extractPageModel(
  page: pdfjsLib.PDFPageProxy,
  viewport: pdfjsLib.PageViewport
): Promise<ComparePageModel> {
  const textContent = await page.getTextContent({
    disableCombineTextItems: true,
  });
  const styles = textContent.styles ?? {};

  const textRuns = textContent.items.filter(
    (entry): entry is PageTextItem => 'str' in entry
  );

  const positioned: CompareTextItem[] = [];
  textRuns.forEach((run, runIndex) => {
    const converted = toRect(viewport, run, runIndex, styles);
    if (converted.normalizedText.length > 0) {
      positioned.push(converted);
    }
  });

  const textItems = mergeIntoLines(sortCompareTextItems(positioned));
  const { width, height } = viewport;

  return {
    pageNumber: page.pageNumber,
    width,
    height,
    textItems,
    plainText: joinCompareTextItems(textItems),
    hasText: textItems.length > 0,
    source: 'pdfjs',
  };
}

View File

@@ -0,0 +1,76 @@
import Tesseract from 'tesseract.js';
import type { ComparePageModel, CompareTextItem } from '../types.ts';
import { mergeIntoLines, sortCompareTextItems } from './extract-page-model.ts';
import {
joinCompareTextItems,
normalizeCompareText,
} from './text-normalization.ts';
/**
 * The fields of a recognized OCR word that this module reads.
 */
type OcrWord = {
// Recognized text of the word.
text: string;
// Bounding box: (x0, y0) is used as the rectangle origin, x1 - x0 and
// y1 - y0 as its width and height.
bbox: {
x0: number;
y0: number;
x1: number;
y1: number;
};
};
/**
 * Runs OCR on a rendered page and converts the recognized words into a page
 * model.
 *
 * Words that normalize to nothing are dropped. Every remaining word becomes a
 * text item with a single word token; items are then sorted and merged into
 * lines.
 *
 * @param canvas - Rendered page to recognize.
 * @param language - Language passed to the OCR engine.
 * @param onProgress - Optional callback receiving the engine's status text
 *   and progress (0 when none is reported).
 * @returns The page model with `source` set to `'ocr'` and `pageNumber` 0.
 */
export async function recognizePageCanvas(
  canvas: HTMLCanvasElement,
  language: string,
  onProgress?: (status: string, progress: number) => void
): Promise<ComparePageModel> {
  const { data } = await Tesseract.recognize(canvas, language, {
    logger: (message) => {
      onProgress?.(message.status, message.progress || 0);
    },
  });

  const recognizedWords =
    (data as unknown as { words?: OcrWord[] }).words || [];

  const items: CompareTextItem[] = [];
  recognizedWords.forEach((word, index) => {
    const normalizedText = normalizeCompareText(word.text || '');
    if (!normalizedText) return;

    // Called twice so the item and its token own separate rect objects.
    const boxToRect = () => ({
      x: word.bbox.x0,
      y: word.bbox.y0,
      width: Math.max(word.bbox.x1 - word.bbox.x0, 1),
      height: Math.max(word.bbox.y1 - word.bbox.y0, 1),
    });

    items.push({
      id: `ocr-${index}-${normalizedText}`,
      text: word.text,
      normalizedText,
      rect: boxToRect(),
      wordTokens: [
        {
          word: normalizedText,
          compareWord: normalizedText.toLowerCase(),
          rect: boxToRect(),
        },
      ],
    });
  });

  const textItems = mergeIntoLines(sortCompareTextItems(items));
  return {
    pageNumber: 0,
    width: canvas.width,
    height: canvas.height,
    textItems,
    plainText: joinCompareTextItems(textItems),
    hasText: textItems.length > 0,
    source: 'ocr',
  };
}

View File

@@ -0,0 +1,61 @@
import * as pdfjsLib from 'pdfjs-dist';
import type { ComparePageSignature, CompareTextItem } from '../types.ts';
import {
joinNormalizedText,
normalizeCompareText,
} from './text-normalization.ts';
/**
 * Shape of the text-content entries kept for page signatures; entries are
 * selected by the presence of a `str` property, and only `str` is read here.
 */
type SignatureTextItem = {
// Raw text of the entry; normalized into one signature token.
str: string;
dir: string;
transform: number[];
width: number;
height: number;
fontName: string;
hasEOL: boolean;
};
/**
 * Wraps a signature token in a text item. The token serves as both raw and
 * normalized text, and the rectangle is all zeros.
 */
function tokenToItem(token: string, index: number): CompareTextItem {
  const emptyRect = { x: 0, y: 0, width: 0, height: 0 };
  const item: CompareTextItem = {
    id: `token-${index}-${token}`,
    text: token,
    normalizedText: token,
    rect: emptyRect,
  };
  return item;
}
/**
 * Builds a lightweight text signature for one page.
 *
 * Each text entry is normalized into a token, empty tokens are dropped, and
 * only the first 500 tokens are kept.
 *
 * @param pdfDoc - Document to read from.
 * @param pageNumber - Page number passed to `pdfDoc.getPage`.
 * @returns The signature built from the kept tokens.
 */
export async function extractPageSignature(
  pdfDoc: pdfjsLib.PDFDocumentProxy,
  pageNumber: number
): Promise<ComparePageSignature> {
  const page = await pdfDoc.getPage(pageNumber);
  const { items } = await page.getTextContent();

  const textEntries = items.filter(
    (entry): entry is SignatureTextItem => 'str' in entry
  );

  const normalizedTokens: string[] = [];
  for (const entry of textEntries) {
    const token = normalizeCompareText(entry.str);
    if (token) {
      normalizedTokens.push(token);
    }
  }

  const keptTokens = normalizedTokens.slice(0, 500);
  const tokenItems = keptTokens.map((token, index) =>
    tokenToItem(token, index)
  );

  return {
    pageNumber,
    plainText: joinNormalizedText(keptTokens),
    hasText: keptTokens.length > 0,
    tokenItems,
  };
}
/**
 * Extracts the signature of every page, one page at a time, in page order.
 *
 * @param pdfDoc - Document to read from.
 * @param onProgress - Optional callback invoked with the page number and the
 *   total page count just before each page is processed.
 * @returns The signatures for pages 1 through `pdfDoc.numPages`.
 */
export async function extractDocumentSignatures(
  pdfDoc: pdfjsLib.PDFDocumentProxy,
  onProgress?: (pageNumber: number, totalPages: number) => void
) {
  const signatures: ComparePageSignature[] = [];
  let pageNumber = 1;
  while (pageNumber <= pdfDoc.numPages) {
    onProgress?.(pageNumber, pdfDoc.numPages);
    const signature = await extractPageSignature(pdfDoc, pageNumber);
    signatures.push(signature);
    pageNumber += 1;
  }
  return signatures;
}

View File

@@ -0,0 +1,122 @@
import type { ComparePagePair, ComparePageSignature } from '../types.ts';
/** Splits `text` on whitespace and returns the set of distinct non-empty tokens. */
function tokenize(text: string) {
  const distinct = new Set<string>();
  for (const piece of text.split(/\s+/)) {
    if (piece) {
      distinct.add(piece);
    }
  }
  return distinct;
}
/**
 * Scores how likely two pages are the same page, in the range 0..1.
 *
 * - Neither page has text: 0.7 when the page numbers match, else 0.35.
 * - Exactly one page has text: 0.08.
 * - Both have text: Jaccard similarity of their token sets, plus 0.1 when the
 *   page numbers match, capped at 1.
 */
function similarityScore(
  left: ComparePageSignature,
  right: ComparePageSignature
) {
  const samePageNumber = left.pageNumber === right.pageNumber;

  if (!left.hasText || !right.hasText) {
    if (left.hasText || right.hasText) {
      return 0.08;
    }
    return samePageNumber ? 0.7 : 0.35;
  }

  const leftTokens = tokenize(left.plainText);
  const rightTokens = tokenize(right.plainText);

  let shared = 0;
  for (const token of leftTokens) {
    if (rightTokens.has(token)) {
      shared += 1;
    }
  }

  // |A ∪ B| = |A| + |B| − |A ∩ B|
  const unionSize = leftTokens.size + rightTokens.size - shared;
  const jaccard = unionSize === 0 ? 0 : shared / unionSize;
  const bonus = samePageNumber ? 0.1 : 0;
  return Math.min(jaccard + bonus, 1);
}
/**
 * Aligns the pages of two documents in order.
 *
 * A dynamic-programming alignment is used: pairing two pages costs
 * `1 - similarity`, leaving a page unpaired costs 0.8. On equal cost, pairing
 * is preferred over skipping a left page, which is preferred over skipping a
 * right page.
 *
 * @param leftPages - Signatures of the first document, in page order.
 * @param rightPages - Signatures of the second document, in page order.
 * @returns The pairs in document order with `pairIndex` starting at 1;
 *   unpaired pages have `null` on the other side and confidence 0.
 */
export function pairPages(
  leftPages: ComparePageSignature[],
  rightPages: ComparePageSignature[]
) {
  type Move = 'match' | 'left' | 'right';
  const gapCost = 0.8;
  const rows = leftPages.length + 1;
  const cols = rightPages.length + 1;

  // cost[r][c]: cheapest alignment of the first r left pages with the first
  // c right pages; moves[r][c]: the last step taken to reach it.
  const cost: number[][] = [];
  const moves: Move[][] = [];
  for (let r = 0; r < rows; r += 1) {
    cost.push(new Array<number>(cols).fill(0));
    moves.push(new Array<Move>(cols).fill('match'));
  }
  for (let r = 1; r < rows; r += 1) {
    cost[r][0] = r * gapCost;
    moves[r][0] = 'left';
  }
  for (let c = 1; c < cols; c += 1) {
    cost[0][c] = c * gapCost;
    moves[0][c] = 'right';
  }

  for (let r = 1; r < rows; r += 1) {
    for (let c = 1; c < cols; c += 1) {
      const similarity = similarityScore(leftPages[r - 1], rightPages[c - 1]);
      const pairCost = cost[r - 1][c - 1] + (1 - similarity);
      const skipLeftCost = cost[r - 1][c] + gapCost;
      const skipRightCost = cost[r][c - 1] + gapCost;
      const best = Math.min(pairCost, skipLeftCost, skipRightCost);
      cost[r][c] = best;

      let move: Move = 'right';
      if (best === pairCost) {
        move = 'match';
      } else if (best === skipLeftCost) {
        move = 'left';
      }
      moves[r][c] = move;
    }
  }

  // Walk back from the bottom-right corner; pairs come out last-to-first.
  const reversedPairs: ComparePagePair[] = [];
  let row = leftPages.length;
  let col = rightPages.length;
  while (row > 0 || col > 0) {
    const move = moves[row][col];

    if (row > 0 && col > 0 && move === 'match') {
      const leftSignature = leftPages[row - 1];
      const rightSignature = rightPages[col - 1];
      reversedPairs.push({
        pairIndex: 0,
        leftPageNumber: leftSignature.pageNumber,
        rightPageNumber: rightSignature.pageNumber,
        confidence: similarityScore(leftSignature, rightSignature),
      });
      row -= 1;
      col -= 1;
    } else if (row > 0 && (col === 0 || move === 'left')) {
      reversedPairs.push({
        pairIndex: 0,
        leftPageNumber: leftPages[row - 1].pageNumber,
        rightPageNumber: null,
        confidence: 0,
      });
      row -= 1;
    } else if (col > 0) {
      reversedPairs.push({
        pairIndex: 0,
        leftPageNumber: null,
        rightPageNumber: rightPages[col - 1].pageNumber,
        confidence: 0,
      });
      col -= 1;
    }
  }

  return reversedPairs
    .reverse()
    .map((pair, position) => ({ ...pair, pairIndex: position + 1 }));
}

View File

@@ -0,0 +1,64 @@
import type { CompareTextItem } from '../types.ts';
/**
 * Normalizes text for comparison: applies NFKC, turns control characters and
 * private-use characters (U+E000..U+F8FF) into spaces, collapses whitespace
 * runs to one space and trims both ends.
 */
export function normalizeCompareText(text: string) {
  const compatibilityForm = text.normalize('NFKC');
  const withoutControls = compatibilityForm.replace(
    /[\u0000-\u001F\u007F-\u009F]/g,
    ' '
  );
  const withoutPrivateUse = withoutControls.replace(
    /[\u{E000}-\u{F8FF}]/gu,
    ' '
  );
  const singleSpaced = withoutPrivateUse.replace(/\s+/g, ' ');
  return singleSpaced.trim();
}
/**
 * Decides whether `next` is appended to `current` without a separating
 * space: always when `current` is empty, when `next` starts with closing
 * punctuation, is a lone quote or starts with an apostrophe, or when
 * `current` ends with an opener (bracket, slash, quote, hyphen).
 */
function shouldAppendWithoutSpace(current: string, next: string) {
  const nextHugsLeft =
    /^[,.;:!?%)\]}]/.test(next) || /^["']$/.test(next) || /^[']/u.test(next);
  const currentHugsRight = /[([{/"'-]$/u.test(current);
  return !current || nextHugsLeft || currentHugsRight;
}
/**
 * Joins tokens into one string, skipping empty tokens and inserting a space
 * between two tokens only where the punctuation rules call for one.
 */
export function joinNormalizedText(tokens: string[]) {
  let joined = '';
  for (const token of tokens) {
    if (!token) continue;
    const separator = shouldAppendWithoutSpace(joined, token) ? '' : ' ';
    joined = `${joined}${separator}${token}`;
  }
  return joined;
}
/** Joins the normalized text of `items`, in order, into one string. */
export function joinCompareTextItems(items: CompareTextItem[]) {
  const normalizedTexts: string[] = [];
  for (const item of items) {
    normalizedTexts.push(item.normalizedText);
  }
  return joinNormalizedText(normalizedTexts);
}
/**
 * Heuristically flags extracted text that is unlikely to be real content.
 *
 * Text is low quality when, after normalization, it is empty or has no
 * letters or digits; when it has at least 12 visible characters of which
 * under 45% are letters/digits and over 35% are other symbols; or when it has
 * at least 6 tokens of which under 60% contain a letter or digit.
 */
export function isLowQualityExtractedText(text: string) {
  const normalized = normalizeCompareText(text);
  if (!normalized) return true;

  const letterOrDigit = /[\p{L}\p{N}]/u;

  // Count visible characters by code point, and how many are letters/digits.
  let visibleCount = 0;
  let alphaNumericCount = 0;
  for (const character of normalized) {
    if (character.trim().length === 0) continue;
    visibleCount += 1;
    if (letterOrDigit.test(character)) {
      alphaNumericCount += 1;
    }
  }

  if (alphaNumericCount === 0) return true;

  const symbolCount = visibleCount - alphaNumericCount;
  const mostlySymbols =
    visibleCount >= 12 &&
    alphaNumericCount / visibleCount < 0.45 &&
    symbolCount / visibleCount > 0.35;
  if (mostlySymbols) {
    return true;
  }

  const tokens = normalized.split(/\s+/).filter(Boolean);
  let meaningfulTokens = 0;
  for (const token of tokens) {
    if (letterOrDigit.test(token)) {
      meaningfulTokens += 1;
    }
  }

  if (tokens.length >= 6 && meaningfulTokens / tokens.length < 0.6) {
    return true;
  }
  return false;
}

View File

@@ -0,0 +1,134 @@
import pixelmatch from 'pixelmatch';
import type { CompareVisualDiff } from '../types.ts';
/**
 * Rectangle to crop the visual diff to. It is used as the source rectangle
 * when drawing from the size-normalized page canvases.
 */
type FocusRegion = {
// Top-left corner of the crop.
x: number;
y: number;
// Size of the crop.
width: number;
height: number;
};
/** Creates a detached canvas element with the given pixel size. */
function createCanvas(width: number, height: number): HTMLCanvasElement {
  return Object.assign(document.createElement('canvas'), { width, height });
}
/**
 * Paints `targetCanvas` white and draws `sourceCanvas` centered on it
 * (offsets rounded down).
 *
 * @throws Error when the target canvas cannot provide a 2D context.
 */
function drawNormalized(
  sourceCanvas: HTMLCanvasElement,
  targetCanvas: HTMLCanvasElement
) {
  const context = targetCanvas.getContext('2d');
  if (!context) {
    throw new Error('Could not create comparison canvas context.');
  }

  const { width: targetWidth, height: targetHeight } = targetCanvas;
  context.fillStyle = '#ffffff';
  context.fillRect(0, 0, targetWidth, targetHeight);

  const horizontalMargin = (targetWidth - sourceCanvas.width) / 2;
  const verticalMargin = (targetHeight - sourceCanvas.height) / 2;
  context.drawImage(
    sourceCanvas,
    Math.floor(horizontalMargin),
    Math.floor(verticalMargin)
  );
}
/**
 * Clamps a requested focus region so that it lies entirely inside a
 * `width` x `height` canvas and is at least 1x1.
 *
 * The origin is floored and the size is rounded up, as before; the size is
 * then limited to the space remaining to the right of / below the origin.
 */
function clampFocusRegion(
  focusRegion: FocusRegion,
  width: number,
  height: number
): FocusRegion {
  const x = Math.min(Math.max(Math.floor(focusRegion.x), 0), width - 1);
  const y = Math.min(Math.max(Math.floor(focusRegion.y), 0), height - 1);
  return {
    x,
    y,
    width: Math.max(Math.min(Math.ceil(focusRegion.width), width - x), 1),
    height: Math.max(Math.min(Math.ceil(focusRegion.height), height - y), 1),
  };
}

/**
 * Renders a pixel-level diff of two page canvases into `outputCanvas`.
 *
 * Both inputs are centered on white canvases of a common size (the larger
 * width and height of the two), compared pixel by pixel, and the second page
 * is drawn with the diff overlay on top at 90% opacity. When `focusRegion` is
 * given, only that region (clamped to the common canvas) is drawn and
 * `outputCanvas` is sized to it; the mismatch statistics always cover the
 * whole page.
 *
 * @param canvas1 - Rendering of the first page.
 * @param canvas2 - Rendering of the second page.
 * @param outputCanvas - Canvas that receives the result; it is resized.
 * @param focusRegion - Optional crop, in common-canvas pixels.
 * @returns Mismatch pixel count, its ratio to the page area, and whether any
 *   pixel differs.
 * @throws Error when a required 2D context cannot be created.
 */
export function renderVisualDiff(
  canvas1: HTMLCanvasElement,
  canvas2: HTMLCanvasElement,
  outputCanvas: HTMLCanvasElement,
  focusRegion?: FocusRegion
): CompareVisualDiff {
  const width = Math.max(canvas1.width, canvas2.width, 1);
  const height = Math.max(canvas1.height, canvas2.height, 1);

  const normalizedCanvas1 = createCanvas(width, height);
  const normalizedCanvas2 = createCanvas(width, height);
  drawNormalized(canvas1, normalizedCanvas1);
  drawNormalized(canvas2, normalizedCanvas2);

  const context1 = normalizedCanvas1.getContext('2d');
  const context2 = normalizedCanvas2.getContext('2d');
  const outputContext = outputCanvas.getContext('2d');
  if (!context1 || !context2 || !outputContext) {
    throw new Error('Could not create visual diff context.');
  }

  const image1 = context1.getImageData(0, 0, width, height);
  const image2 = context2.getImageData(0, 0, width, height);
  // createImageData does not depend on the output canvas size, so the output
  // canvas is sized only once, below, when the final region is known.
  const diffImage = outputContext.createImageData(width, height);
  const mismatchPixels = pixelmatch(
    image1.data,
    image2.data,
    diffImage.data,
    width,
    height,
    {
      threshold: 0.12,
      includeAA: false,
      alpha: 0.2,
      diffMask: false,
      diffColor: [239, 68, 68],
      diffColorAlt: [34, 197, 94],
    }
  );

  const overlayCanvas = createCanvas(width, height);
  const overlayContext = overlayCanvas.getContext('2d');
  if (!overlayContext) {
    throw new Error('Could not create visual diff overlay context.');
  }
  overlayContext.putImageData(diffImage, 0, 0);

  // Keep the crop inside the source canvases: a source rectangle that sticks
  // out past the image would otherwise be clipped by the canvas and leave
  // part of the output blank.
  const region = focusRegion
    ? clampFocusRegion(focusRegion, width, height)
    : { x: 0, y: 0, width, height };

  // Resizing also clears the canvas and resets the context state.
  outputCanvas.width = Math.max(region.width, 1);
  outputCanvas.height = Math.max(region.height, 1);
  outputContext.fillStyle = '#ffffff';
  outputContext.fillRect(0, 0, outputCanvas.width, outputCanvas.height);

  outputContext.drawImage(
    normalizedCanvas2,
    region.x,
    region.y,
    region.width,
    region.height,
    0,
    0,
    outputCanvas.width,
    outputCanvas.height
  );
  outputContext.globalAlpha = 0.9;
  outputContext.drawImage(
    overlayCanvas,
    region.x,
    region.y,
    region.width,
    region.height,
    0,
    0,
    outputCanvas.width,
    outputCanvas.height
  );
  outputContext.globalAlpha = 1;

  return {
    mismatchPixels,
    mismatchRatio: mismatchPixels / Math.max(width * height, 1),
    hasDiff: mismatchPixels > 0,
  };
}