Files
bentopdf/src/js/compare/engine/extract-page-model.ts
alam00000 1d68691331 refactor: update PDF comparison types and enhance UI for better usability
- Refactored CompareState to import from a centralized type definition.
- Enhanced the compare-pdfs.html layout with improved styles for overlay and side-by-side modes.
- Added new CSS styles for various UI components including panels, buttons, and highlights.
- Implemented a new sidebar for displaying change summaries and filters.
- Created unit tests for text comparison logic, including diffing text runs and page pairing.
- Added tests for text normalization functions to ensure proper handling of punctuation and character normalization.
2026-03-08 23:55:33 +05:30

521 lines
14 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import * as pdfjsLib from 'pdfjs-dist';
import type {
ComparePageModel,
CompareTextItem,
CharPosition,
CompareWordToken,
} from '../types.ts';
import {
joinCompareTextItems,
normalizeCompareText,
} from './text-normalization.ts';
// Shape of a text item as returned by pdf.js getTextContent() — the subset
// of fields this module actually reads.
type PageTextItem = {
  str: string; // raw text content of the item
  width: number; // advance width in text-space units
  height: number; // height in text-space units
  transform: number[]; // text matrix [a, b, c, d, e, f]; [4]/[5] are the origin
  dir: string; // text direction reported by pdf.js (e.g. 'ltr')
  fontName: string; // key into the styles map from getTextContent()
  hasEOL: boolean; // pdf.js end-of-line marker
};
// Map from pdf.js fontName keys to the style info used for measurement.
type TextStyles = Record<string, { fontFamily?: string }>;
// Lazily-created canvas 2D context used to measure rendered text widths.
// Both are null outside the browser (e.g. unit tests / SSR), in which case
// measureTextWidth() falls back to fixed per-character estimates.
const measurementCanvas =
  typeof document !== 'undefined' ? document.createElement('canvas') : null;
const measurementContext = measurementCanvas
  ? measurementCanvas.getContext('2d')
  : null;
// Memoizes measureText results, keyed by `${fontSpec}|${text}`.
// NOTE(review): unbounded — presumably acceptable for per-document lifetimes;
// confirm for long comparison sessions.
const textMeasurementCache: Map<string, number> | null = measurementContext
  ? new Map()
  : null;
// Font last assigned to the context, so we only reassign when it changes.
let lastMeasurementFont = '';
// Fallback advance widths (relative units) used when no canvas is available.
const DEFAULT_CHAR_WIDTH = 1;
const DEFAULT_SPACE_WIDTH = 0.33;
/**
 * Decides whether a word token should be glued onto the previous one:
 * either the current token starts with trailing punctuation / a closing
 * quote, or the previous token ends with an opening bracket, quote, slash
 * or hyphen left dangling.
 */
function shouldJoinTokenWithPrevious(previous: string, current: string) {
  if (!previous) {
    return false;
  }
  const currentStartsWithJoiner =
    /^[,.;:!?%)\]}]/.test(current) || /^[''"']/u.test(current);
  const previousEndsOpen = /[([{/"'-]$/u.test(previous);
  return currentStartsWithJoiner || previousEndsOpen;
}
/**
 * Measures the advance width of `text` when rendered with `fontSpec`
 * (a CSS font shorthand, e.g. "12px serif").
 *
 * Results are memoized per (fontSpec, text) pair. Outside the browser
 * (no canvas context) it falls back to fixed per-character estimates.
 *
 * Fix: the memo is now consulted BEFORE touching the canvas context, so a
 * cache hit no longer mutates `measurementContext.font` (the original set
 * the font first, doing a wasted — and canvas-state-changing — font parse
 * on every cached lookup with a new font spec).
 */
function measureTextWidth(fontSpec: string, text: string): number {
  if (!measurementContext) {
    // Headless fallback: approximate with constant advance widths.
    if (!text) return 0;
    if (text === ' ') return DEFAULT_SPACE_WIDTH;
    return text.length * DEFAULT_CHAR_WIDTH;
  }
  const key = `${fontSpec}|${text}`;
  const cached = textMeasurementCache?.get(key);
  if (cached !== undefined) {
    return cached;
  }
  // Only reassign the context font when it actually changes; assignment
  // forces the canvas to re-parse the font string.
  if (lastMeasurementFont !== fontSpec) {
    measurementContext.font = fontSpec;
    lastMeasurementFont = fontSpec;
  }
  const width = measurementContext.measureText(text).width || 0;
  textMeasurementCache?.set(key, width);
  return width;
}
/**
 * Splits a pdf.js text item into per-word tokens, each with a rect in
 * viewport (CSS pixel) coordinates.
 *
 * Per-character advance widths are measured incrementally with a canvas
 * (falling back to fixed estimates off-DOM) and accumulated into a prefix
 * sum, so each word's span can be located proportionally along the item's
 * baseline. Rotated text is handled via the transformed baseline/vertical
 * direction vectors.
 *
 * @param viewport maps PDF-space points to viewport coordinates
 * @param item raw text item from getTextContent()
 * @param fallbackRect item-level rect used when word geometry degenerates
 * @param styles font style map from getTextContent(), for font-family lookup
 * @returns one token per normalized word; empty for whitespace-only items
 */
function buildItemWordTokens(
  viewport: pdfjsLib.PageViewport,
  item: PageTextItem,
  fallbackRect: CompareTextItem['rect'],
  styles: TextStyles
): CompareWordToken[] {
  const rawText = item.str || '';
  if (!rawText.trim()) {
    return [];
  }
  const totalLen = Math.max(rawText.length, 1);
  const textStyle = item.fontName ? styles[item.fontName] : undefined;
  const fontFamily = textStyle?.fontFamily ?? 'sans-serif';
  // Effective font size = magnitude of the transform's horizontal basis
  // vector; clamped to at least 0.5 so measurement never uses a zero font.
  const fontScale = Math.max(
    0.5,
    Math.hypot(item.transform[0], item.transform[1]) || 0
  );
  const fontSpec = `${fontScale}px ${fontFamily}`;
  // Per-character advance widths, measured on the growing prefix string so
  // in-context effects (e.g. kerning) are captured.
  const weights: number[] = new Array(totalLen);
  let runningText = '';
  let previousAdvance = 0;
  for (let index = 0; index < totalLen; index += 1) {
    runningText += rawText[index];
    const advance = measureTextWidth(fontSpec, runningText);
    let width = advance - previousAdvance;
    if (!Number.isFinite(width) || width <= 0) {
      // Guard against non-monotonic / broken measurements.
      width = rawText[index] === ' ' ? DEFAULT_SPACE_WIDTH : DEFAULT_CHAR_WIDTH;
    }
    weights[index] = width;
    previousAdvance = advance;
  }
  if (!Number.isFinite(previousAdvance) || previousAdvance <= 0) {
    // Measurement failed entirely: fall back to uniform estimates.
    for (let index = 0; index < totalLen; index += 1) {
      weights[index] =
        rawText[index] === ' ' ? DEFAULT_SPACE_WIDTH : DEFAULT_CHAR_WIDTH;
    }
  }
  // prefix[i] = total advance of the first i characters.
  const prefix: number[] = new Array(totalLen + 1);
  prefix[0] = 0;
  for (let index = 0; index < totalLen; index += 1) {
    prefix[index + 1] = prefix[index] + weights[index];
  }
  const totalWeight = prefix[totalLen] || 1;
  // Transform the item's four corners (origin, +width, +height, +both) into
  // viewport space.
  const rawX = item.transform[4];
  const rawY = item.transform[5];
  const transformed = [
    viewport.convertToViewportPoint(rawX, rawY),
    viewport.convertToViewportPoint(rawX + item.width, rawY),
    viewport.convertToViewportPoint(rawX, rawY + item.height),
    viewport.convertToViewportPoint(rawX + item.width, rawY + item.height),
  ];
  const xs = transformed.map(([x]) => x);
  const ys = transformed.map(([, y]) => y);
  const left = Math.min(...xs);
  const right = Math.max(...xs);
  const top = Math.min(...ys);
  const bottom = Math.max(...ys);
  // Baseline and vertical direction vectors in viewport space; these stay
  // meaningful for rotated text where the axis-aligned box does not.
  const [baselineStart, baselineEnd, verticalEnd] = transformed;
  const baselineVector: [number, number] = [
    baselineEnd[0] - baselineStart[0],
    baselineEnd[1] - baselineStart[1],
  ];
  const verticalVector: [number, number] = [
    verticalEnd[0] - baselineStart[0],
    verticalEnd[1] - baselineStart[1],
  ];
  const hasOrientationVectors =
    Math.hypot(baselineVector[0], baselineVector[1]) > 1e-6 &&
    Math.hypot(verticalVector[0], verticalVector[1]) > 1e-6;
  const tokens: CompareWordToken[] = [];
  const wordRegex = /\S+/gu;
  let match: RegExpExecArray | null;
  let previousEnd = 0;
  while ((match = wordRegex.exec(rawText)) !== null) {
    const tokenText = match[0];
    const normalizedWord = normalizeCompareText(tokenText);
    if (!normalizedWord) {
      // Token normalizes away (e.g. punctuation-only); skip it, but record
      // where it ended so the gap check below stays correct.
      previousEnd = match.index + tokenText.length;
      continue;
    }
    const startIndex = match.index;
    const endIndex = startIndex + tokenText.length;
    // Fractional start/end of the word along the item's total advance.
    const relStart = prefix[startIndex] / totalWeight;
    const relEnd = prefix[endIndex] / totalWeight;
    let wordLeft: number;
    let wordRight: number;
    let wordTop: number;
    let wordBottom: number;
    if (hasOrientationVectors) {
      // Interpolate the word span along the baseline, then take the
      // axis-aligned bounds of the resulting parallelogram.
      const segStart: [number, number] = [
        baselineStart[0] + baselineVector[0] * relStart,
        baselineStart[1] + baselineVector[1] * relStart,
      ];
      const segEnd: [number, number] = [
        baselineStart[0] + baselineVector[0] * relEnd,
        baselineStart[1] + baselineVector[1] * relEnd,
      ];
      const cornerPoints: Array<[number, number]> = [
        segStart,
        [segStart[0] + verticalVector[0], segStart[1] + verticalVector[1]],
        [segEnd[0] + verticalVector[0], segEnd[1] + verticalVector[1]],
        segEnd,
      ];
      wordLeft = Math.min(...cornerPoints.map(([x]) => x));
      wordRight = Math.max(...cornerPoints.map(([x]) => x));
      wordTop = Math.min(...cornerPoints.map(([, y]) => y));
      wordBottom = Math.max(...cornerPoints.map(([, y]) => y));
    } else {
      // Degenerate geometry: interpolate inside the axis-aligned box.
      const segLeft = left + (right - left) * relStart;
      const segRight = left + (right - left) * relEnd;
      wordLeft = Math.min(segLeft, segRight);
      wordRight = Math.max(segLeft, segRight);
      wordTop = top;
      wordBottom = bottom;
    }
    const width = Math.max(wordRight - wordLeft, 1);
    const height = Math.max(wordBottom - wordTop, fallbackRect.height);
    // Characters between the previous token and this one; when that run is
    // non-whitespace (a skipped token), the two words visually join.
    const gapText = rawText.slice(previousEnd, startIndex);
    const previousToken = tokens[tokens.length - 1];
    tokens.push({
      word: normalizedWord,
      compareWord: normalizedWord.toLowerCase(),
      rect: {
        x: Number.isFinite(wordLeft) ? wordLeft : fallbackRect.x,
        y: Number.isFinite(wordTop) ? wordTop : fallbackRect.y,
        width,
        height,
      },
      joinsWithPrevious:
        (gapText.length > 0 && !/\s/u.test(gapText)) ||
        (previousToken
          ? shouldJoinTokenWithPrevious(previousToken.word, normalizedWord)
          : false),
    });
    previousEnd = endIndex;
  }
  return tokens;
}
/**
 * Converts a raw pdf.js text item into a CompareTextItem whose rect is
 * expressed in viewport (CSS pixel) coordinates.
 */
function toRect(
  viewport: pdfjsLib.PageViewport,
  item: PageTextItem,
  index: number,
  styles: TextStyles
) {
  const normalizedText = normalizeCompareText(item.str);
  const matrix = pdfjsLib.Util.transform(viewport.transform, item.transform);
  const boxWidth = Math.max(item.width * viewport.scale, 1);
  // Prefer the transformed vertical scale; fall back to the scaled item
  // height when it is zero, and never go below 1px.
  const scaledItemHeight = item.height * viewport.scale;
  const boxHeight = Math.max(Math.abs(matrix[3]) || scaledItemHeight, 1);
  const rect = {
    x: matrix[4],
    // matrix[5] is the baseline y; shift up by the height to get the top edge
    y: matrix[5] - boxHeight,
    width: boxWidth,
    height: boxHeight,
  };
  return {
    id: `${index}-${normalizedText}`,
    text: item.str,
    normalizedText,
    rect,
    wordTokens: buildItemWordTokens(viewport, item, rect, styles),
  } satisfies CompareTextItem;
}
/**
 * Returns a copy of `items` ordered top-to-bottom, then left-to-right.
 * Two items whose vertical offset falls within a height-derived tolerance
 * are treated as being on the same line; near-identical x positions fall
 * back to a stable id comparison. The input array is not mutated.
 */
export function sortCompareTextItems(items: CompareTextItem[]) {
  const compare = (a: CompareTextItem, b: CompareTextItem) => {
    const tolerance = Math.max(Math.min(a.rect.height, b.rect.height) * 0.6, 4);
    const verticalDelta = a.rect.y - b.rect.y;
    if (Math.abs(verticalDelta) > tolerance) {
      return verticalDelta;
    }
    const horizontalDelta = a.rect.x - b.rect.x;
    if (Math.abs(horizontalDelta) > 1) {
      return horizontalDelta;
    }
    return a.id.localeCompare(b.id);
  };
  return [...items].sort(compare);
}
/** Approximate width of a single non-whitespace character in the item. */
function averageCharacterWidth(item: CompareTextItem) {
  const glyphCount = item.normalizedText.replace(/\s+/g, '').length;
  return item.rect.width / Math.max(glyphCount, 1);
}
/**
 * Heuristic: should a space be inserted between two horizontally adjacent
 * fragments when merging them into one line? Spaces are suppressed before
 * trailing punctuation / closing quotes and after opening brackets, quotes,
 * slashes or hyphens; otherwise the horizontal gap must exceed roughly half
 * an average character width.
 */
function shouldInsertSpaceBetweenItems(
  left: CompareTextItem,
  right: CompareTextItem
) {
  if (!left.normalizedText || !right.normalizedText) {
    return false;
  }
  const spaceSuppressed =
    /^[,.;:!?%)\]}]/.test(right.normalizedText) ||
    /^[''"']/u.test(right.normalizedText) ||
    /[([{/"'-]$/u.test(left.normalizedText);
  if (spaceSuppressed) {
    return false;
  }
  const gap = right.rect.x - (left.rect.x + left.rect.width);
  if (gap <= 0) {
    return false;
  }
  const narrowestChar = Math.min(
    averageCharacterWidth(left),
    averageCharacterWidth(right)
  );
  return gap >= Math.max(narrowestChar * 0.45, 1.5);
}
/**
 * Concatenates the normalized text of all fragments on one line, inserting
 * heuristic spaces between fragments, and builds a parallel per-character
 * position map (x + width for each character of the merged string, with
 * each fragment's rect divided evenly among its characters).
 *
 * NOTE(review): the merged text is re-normalized at the end, which could in
 * principle change its length relative to charMap — presumably stable for
 * already-normalized fragment text; verify against normalizeCompareText.
 */
function mergeLineText(lineItems: CompareTextItem[]): {
  text: string;
  charMap: CharPosition[];
} {
  if (lineItems.length === 0) {
    return { text: '', charMap: [] };
  }
  const charMap: CharPosition[] = [];
  // Appends one CharPosition per character of the fragment, spaced uniformly
  // across the fragment's rect.
  function pushFragChars(frag: CompareTextItem) {
    const fragText = frag.normalizedText;
    const fragCharWidth = frag.rect.width / Math.max(fragText.length, 1);
    for (let ci = 0; ci < fragText.length; ci++) {
      charMap.push({
        x: frag.rect.x + ci * fragCharWidth,
        width: fragCharWidth,
      });
    }
  }
  let merged = lineItems[0].normalizedText;
  pushFragChars(lineItems[0]);
  for (let index = 1; index < lineItems.length; index += 1) {
    const previous = lineItems[index - 1];
    const current = lineItems[index];
    if (shouldInsertSpaceBetweenItems(previous, current)) {
      // The inserted space gets a position covering the inter-fragment gap
      // (at least 1px wide).
      const gap = current.rect.x - (previous.rect.x + previous.rect.width);
      charMap.push({
        x: previous.rect.x + previous.rect.width,
        width: Math.max(gap, 1),
      });
      merged += ` ${current.normalizedText}`;
    } else {
      merged += current.normalizedText;
    }
    pushFragChars(current);
  }
  return { text: normalizeCompareText(merged), charMap };
}
/**
 * Combines two adjacent word tokens into one: the texts are concatenated and
 * the bounding rect is the union of both rects. `joinsWithPrevious` is not
 * carried over — the merged token's relationship is decided by the caller.
 */
function mergeWordTokenRects(
  left: CompareWordToken,
  right: CompareWordToken
): CompareWordToken {
  const x = Math.min(left.rect.x, right.rect.x);
  const y = Math.min(left.rect.y, right.rect.y);
  const rightEdge = Math.max(
    left.rect.x + left.rect.width,
    right.rect.x + right.rect.width
  );
  const bottomEdge = Math.max(
    left.rect.y + left.rect.height,
    right.rect.y + right.rect.height
  );
  return {
    word: `${left.word}${right.word}`,
    compareWord: `${left.compareWord}${right.compareWord}`,
    rect: { x, y, width: rightEdge - x, height: bottomEdge - y },
  };
}
/**
 * Flattens per-fragment word tokens into a single token list for a merged
 * line. Tokens are glued together when they should read as one word: within
 * a fragment via the token's `joinsWithPrevious` flag, and across fragment
 * boundaries when no space would be inserted between the fragments.
 * Returns undefined when no fragment carries word tokens at all.
 */
function buildMergedWordTokens(lineItems: CompareTextItem[]) {
  const hasAnyTokens = lineItems.some(
    (item) => (item.wordTokens?.length ?? 0) > 0
  );
  if (!hasAnyTokens) {
    return undefined;
  }
  const merged: CompareWordToken[] = [];
  let previous: CompareTextItem | null = null;
  for (const item of lineItems) {
    // A fragment without tokens contributes its whole text as one token.
    const tokens: CompareWordToken[] =
      item.wordTokens && item.wordTokens.length > 0
        ? item.wordTokens
        : [
            {
              word: item.normalizedText,
              compareWord: item.normalizedText.toLowerCase(),
              rect: item.rect,
            } satisfies CompareWordToken,
          ];
    tokens.forEach((token, tokenIndex) => {
      let glueToPrevious = false;
      if (merged.length > 0) {
        glueToPrevious =
          tokenIndex > 0
            ? Boolean(token.joinsWithPrevious)
            : previous !== null &&
              !shouldInsertSpaceBetweenItems(previous, item);
      }
      if (glueToPrevious) {
        merged[merged.length - 1] = mergeWordTokenRects(
          merged[merged.length - 1],
          token
        );
      } else {
        merged.push({
          word: token.word,
          compareWord: token.compareWord,
          rect: token.rect,
        });
      }
    });
    previous = item;
  }
  return merged;
}
/**
 * Groups vertically-sorted items into visual lines and merges each group
 * into a single CompareTextItem spanning the whole line. The first item of
 * each group acts as the vertical anchor; following items join the group
 * while their y stays within a height-derived tolerance of that anchor.
 */
export function mergeIntoLines(
  sortedItems: CompareTextItem[]
): CompareTextItem[] {
  if (sortedItems.length === 0) {
    return [];
  }
  // Pass 1: partition the sorted items into line groups.
  const lines: CompareTextItem[][] = [];
  let group: CompareTextItem[] = [sortedItems[0]];
  for (const item of sortedItems.slice(1)) {
    const anchor = group[0];
    const tolerance = Math.max(
      Math.min(anchor.rect.height, item.rect.height) * 0.6,
      4
    );
    if (Math.abs(item.rect.y - anchor.rect.y) <= tolerance) {
      group.push(item);
    } else {
      lines.push(group);
      group = [item];
    }
  }
  lines.push(group);
  // Pass 2: collapse each group into one line-level item with a union box.
  return lines.map((lineItems, lineIndex) => {
    const { text: normalizedText, charMap } = mergeLineText(lineItems);
    const lefts = lineItems.map((item) => item.rect.x);
    const tops = lineItems.map((item) => item.rect.y);
    const rights = lineItems.map((item) => item.rect.x + item.rect.width);
    const bottoms = lineItems.map((item) => item.rect.y + item.rect.height);
    const x = Math.min(...lefts);
    const y = Math.min(...tops);
    return {
      id: `line-${lineIndex}`,
      text: lineItems.map((item) => item.text).join(' '),
      normalizedText,
      rect: {
        x,
        y,
        width: Math.max(...rights) - x,
        height: Math.max(...bottoms) - y,
      },
      fragments: lineItems,
      charMap,
      wordTokens: buildMergedWordTokens(lineItems),
    };
  });
}
/**
 * Builds the comparison model for one PDF page: pulls the raw text content
 * via pdf.js, converts every string item into viewport coordinates, drops
 * items whose normalized text is empty, sorts them into reading order, and
 * merges fragments into lines.
 */
export async function extractPageModel(
  page: pdfjsLib.PDFPageProxy,
  viewport: pdfjsLib.PageViewport
): Promise<ComparePageModel> {
  const textContent = await page.getTextContent({
    disableCombineTextItems: true,
  });
  const styles = textContent.styles ?? {};
  const positionedItems = textContent.items
    .filter((item): item is PageTextItem => 'str' in item)
    .map((item, index) => toRect(viewport, item, index, styles))
    .filter((item) => item.normalizedText.length > 0);
  const textItems = mergeIntoLines(sortCompareTextItems(positionedItems));
  return {
    pageNumber: page.pageNumber,
    width: viewport.width,
    height: viewport.height,
    textItems,
    plainText: joinCompareTextItems(textItems),
    hasText: textItems.length > 0,
    source: 'pdfjs',
  };
}