521 lines
14 KiB
TypeScript
521 lines
14 KiB
TypeScript
|
|
import * as pdfjsLib from 'pdfjs-dist';
|
|||
|
|
|
|||
|
|
import type {
|
|||
|
|
ComparePageModel,
|
|||
|
|
CompareTextItem,
|
|||
|
|
CharPosition,
|
|||
|
|
CompareWordToken,
|
|||
|
|
} from '../types.ts';
|
|||
|
|
import {
|
|||
|
|
joinCompareTextItems,
|
|||
|
|
normalizeCompareText,
|
|||
|
|
} from './text-normalization.ts';
|
|||
|
|
|
|||
|
|
/**
 * The fields of a text-content item that this module relies on.
 *
 * NOTE(review): presumably mirrors the text item shape returned by
 * `page.getTextContent()` in the installed pdfjs-dist — confirm against its typings.
 */
type PageTextItem = {
  /** Raw text of the item; tokenised and normalised by this module. */
  str: string;
  /** Item extent along x; added to `transform[4]` and scaled by `viewport.scale` here. */
  width: number;
  /** Item extent along y; added to `transform[5]` and scaled by `viewport.scale` here. */
  height: number;
  /** Matrix of which entries 0–1 give the font scale and entries 4–5 the origin. */
  transform: number[];
  dir: string;
  /** Lookup key into the accompanying {@link TextStyles} map. */
  fontName: string;
  hasEOL: boolean;
};

/** Style records keyed by font name; only `fontFamily` is read by this module. */
type TextStyles = Record<string, { fontFamily?: string }>;
|
|||
|
|
|
|||
|
|
// Detached canvas used only for measuring text; null when no `document` global exists.
const measurementCanvas =
  typeof document === 'undefined' ? null : document.createElement('canvas');

// 2D context of the measurement canvas, or null when the canvas or context is unavailable.
const measurementContext = measurementCanvas?.getContext('2d') ?? null;

// Memoised widths keyed by `${fontSpec}|${text}`; only allocated when measuring is possible.
const textMeasurementCache: Map<string, number> | null =
  measurementContext === null ? null : new Map<string, number>();

// Font string most recently assigned to the context, so redundant assignments are skipped.
let lastMeasurementFont = '';

// Relative widths used when canvas measurement is unavailable or yields unusable values.
const DEFAULT_CHAR_WIDTH = 1;
const DEFAULT_SPACE_WIDTH = 0.33;
|
|||
|
|
|
|||
|
|
/**
 * Decides whether `current` should be glued onto `previous` with no space.
 *
 * Joins when `current` starts with closing punctuation or a quote character,
 * or when `previous` ends with an opening bracket, slash, quote or hyphen.
 * An empty `previous` never joins.
 */
function shouldJoinTokenWithPrevious(previous: string, current: string) {
  if (!previous) {
    return false;
  }

  const startsWithClosingPunctuation = /^[,.;:!?%)\]}]/.test(current);
  const startsWithQuote = /^[''"'’”]/u.test(current);
  const previousEndsWithOpener = /[([{/"'“‘-]$/u.test(previous);

  return (
    startsWithClosingPunctuation || startsWithQuote || previousEndsWithOpener
  );
}
|
|||
|
|
|
|||
|
|
/**
 * Maximum number of memoised measurements kept before the cache is reset.
 * Callers measure every prefix of every text item, so without a bound the
 * cache would grow for as long as the module stays loaded.
 */
const MAX_TEXT_MEASUREMENT_CACHE_ENTRIES = 20000;

/**
 * Returns the width of `text` rendered with the canvas font string `fontSpec`.
 *
 * Without a measurement context the width is estimated: 0 for empty text,
 * `DEFAULT_SPACE_WIDTH` for a single space, otherwise
 * `text.length * DEFAULT_CHAR_WIDTH`.
 *
 * With a context, results are memoised per `(fontSpec, text)` pair. A falsy
 * measured width (for example `NaN`) is reported as 0.
 *
 * @param fontSpec CSS font shorthand assigned to the canvas context.
 * @param text String to measure.
 * @returns Measured or estimated width.
 */
function measureTextWidth(fontSpec: string, text: string): number {
  if (!measurementContext) {
    if (!text) return 0;
    if (text === ' ') return DEFAULT_SPACE_WIDTH;
    return text.length * DEFAULT_CHAR_WIDTH;
  }

  // Consult the cache first so a hit never triggers a canvas font switch.
  const key = `${fontSpec}|${text}`;
  const cached = textMeasurementCache?.get(key);
  if (cached !== undefined) {
    return cached;
  }

  if (lastMeasurementFont !== fontSpec) {
    measurementContext.font = fontSpec;
    lastMeasurementFont = fontSpec;
  }

  const width = measurementContext.measureText(text).width || 0;

  if (textMeasurementCache) {
    // Dropping everything is cheap and keeps memory bounded; entries are
    // simply re-measured on demand.
    if (textMeasurementCache.size >= MAX_TEXT_MEASUREMENT_CACHE_ENTRIES) {
      textMeasurementCache.clear();
    }
    textMeasurementCache.set(key, width);
  }

  return width;
}
|
|||
|
|
|
|||
|
|
/**
 * Splits a text item into whitespace-delimited word tokens and estimates a
 * viewport-space rectangle for each one.
 *
 * Each character gets a weight equal to the growth in measured width when it
 * is appended to the preceding text. A word's share of the item's extent is
 * the ratio of its cumulative weight range to the total weight.
 *
 * @param viewport Viewport used to convert item corners to viewport points.
 * @param item Text item to tokenise.
 * @param fallbackRect Item-level rect; supplies x/y when computed values are
 *   not finite and acts as the minimum token height.
 * @param styles Font style lookup used to pick the measuring font family.
 * @returns Tokens in reading order; empty when the item is blank.
 */
function buildItemWordTokens(
  viewport: pdfjsLib.PageViewport,
  item: PageTextItem,
  fallbackRect: CompareTextItem['rect'],
  styles: TextStyles
): CompareWordToken[] {
  const source = item.str || '';
  if (!source.trim()) {
    return [];
  }

  const charCount = Math.max(source.length, 1);
  const defaultWeightFor = (char: string | undefined) =>
    char === ' ' ? DEFAULT_SPACE_WIDTH : DEFAULT_CHAR_WIDTH;

  // Build the canvas font string from the item's font family and matrix scale.
  const style = item.fontName ? styles[item.fontName] : undefined;
  const family = style?.fontFamily ?? 'sans-serif';
  const scale = Math.max(
    0.5,
    Math.hypot(item.transform[0], item.transform[1]) || 0
  );
  const fontSpec = `${scale}px ${family}`;

  // Per-character weights: the width gained by appending each character.
  const charWeights: number[] = new Array(charCount);
  let measuredPrefix = '';
  let lastAdvance = 0;
  for (let i = 0; i < charCount; i += 1) {
    measuredPrefix += source[i];
    const advance = measureTextWidth(fontSpec, measuredPrefix);
    const delta = advance - lastAdvance;
    charWeights[i] =
      Number.isFinite(delta) && delta > 0 ? delta : defaultWeightFor(source[i]);
    lastAdvance = advance;
  }

  // If the whole string measured as unusable, use the default weights throughout.
  if (!Number.isFinite(lastAdvance) || lastAdvance <= 0) {
    for (let i = 0; i < charCount; i += 1) {
      charWeights[i] = defaultWeightFor(source[i]);
    }
  }

  // cumulative[k] is the total weight of the first k characters.
  const cumulative: number[] = new Array(charCount + 1);
  cumulative[0] = 0;
  charWeights.forEach((weight, i) => {
    cumulative[i + 1] = cumulative[i] + weight;
  });
  const totalWeight = cumulative[charCount] || 1;

  // Convert the item's four corners into viewport points.
  const originX = item.transform[4];
  const originY = item.transform[5];
  const farX = originX + item.width;
  const farY = originY + item.height;
  const corners = [
    viewport.convertToViewportPoint(originX, originY),
    viewport.convertToViewportPoint(farX, originY),
    viewport.convertToViewportPoint(originX, farY),
    viewport.convertToViewportPoint(farX, farY),
  ];
  const cornerXs = corners.map(([x]) => x);
  const cornerYs = corners.map(([, y]) => y);
  const itemLeft = Math.min(...cornerXs);
  const itemRight = Math.max(...cornerXs);
  const itemTop = Math.min(...cornerYs);
  const itemBottom = Math.max(...cornerYs);

  // Direction vectors along the baseline and along the item's height.
  const [baselineStart, baselineEnd, verticalEnd] = corners;
  const baselineDx = baselineEnd[0] - baselineStart[0];
  const baselineDy = baselineEnd[1] - baselineStart[1];
  const verticalDx = verticalEnd[0] - baselineStart[0];
  const verticalDy = verticalEnd[1] - baselineStart[1];
  const hasOrientationVectors =
    Math.hypot(baselineDx, baselineDy) > 1e-6 &&
    Math.hypot(verticalDx, verticalDy) > 1e-6;

  const tokens: CompareWordToken[] = [];
  const wordPattern = /\S+/gu;
  let previousEnd = 0;
  let match: RegExpExecArray | null;

  while ((match = wordPattern.exec(source)) !== null) {
    const rawToken = match[0];
    const startIndex = match.index;
    const endIndex = startIndex + rawToken.length;

    const normalizedWord = normalizeCompareText(rawToken);
    if (!normalizedWord) {
      // Nothing left after normalisation: skip the token but advance the gap cursor.
      previousEnd = endIndex;
      continue;
    }

    const relStart = cumulative[startIndex] / totalWeight;
    const relEnd = cumulative[endIndex] / totalWeight;

    let bounds: { left: number; right: number; top: number; bottom: number };
    if (hasOrientationVectors) {
      // Slice the baseline proportionally, then extend both ends by the
      // vertical vector and take the enclosing axis-aligned box.
      const startX = baselineStart[0] + baselineDx * relStart;
      const startY = baselineStart[1] + baselineDy * relStart;
      const endX = baselineStart[0] + baselineDx * relEnd;
      const endY = baselineStart[1] + baselineDy * relEnd;
      const quadXs = [startX, startX + verticalDx, endX + verticalDx, endX];
      const quadYs = [startY, startY + verticalDy, endY + verticalDy, endY];
      bounds = {
        left: Math.min(...quadXs),
        right: Math.max(...quadXs),
        top: Math.min(...quadYs),
        bottom: Math.max(...quadYs),
      };
    } else {
      // Degenerate vectors: interpolate horizontally across the item's box.
      const span = itemRight - itemLeft;
      const segLeft = itemLeft + span * relStart;
      const segRight = itemLeft + span * relEnd;
      bounds = {
        left: Math.min(segLeft, segRight),
        right: Math.max(segLeft, segRight),
        top: itemTop,
        bottom: itemBottom,
      };
    }

    const width = Math.max(bounds.right - bounds.left, 1);
    const height = Math.max(bounds.bottom - bounds.top, fallbackRect.height);
    const gapText = source.slice(previousEnd, startIndex);
    const previousToken = tokens[tokens.length - 1];

    // NOTE(review): gaps between `\S+` matches appear to contain only
    // whitespace, so the first clause below may never be true — confirm intent.
    const joinsWithPrevious =
      (gapText.length > 0 && !/\s/u.test(gapText)) ||
      (previousToken
        ? shouldJoinTokenWithPrevious(previousToken.word, normalizedWord)
        : false);

    tokens.push({
      word: normalizedWord,
      compareWord: normalizedWord.toLowerCase(),
      rect: {
        x: Number.isFinite(bounds.left) ? bounds.left : fallbackRect.x,
        y: Number.isFinite(bounds.top) ? bounds.top : fallbackRect.y,
        width,
        height,
      },
      joinsWithPrevious,
    });

    previousEnd = endIndex;
  }

  return tokens;
}
|
|||
|
|
|
|||
|
|
/**
 * Converts a text item into a `CompareTextItem` with a viewport-space rect
 * and per-word tokens.
 *
 * The rect's origin comes from the item transform combined with the viewport
 * transform. Width and height are clamped to at least 1, and `y` is the
 * transformed y minus the height.
 *
 * @param viewport Viewport supplying the transform and scale.
 * @param item Text item to convert.
 * @param index Position of the item in the caller's list; used in the id.
 * @param styles Font style lookup forwarded to the word tokeniser.
 */
function toRect(
  viewport: pdfjsLib.PageViewport,
  item: PageTextItem,
  index: number,
  styles: TextStyles
) {
  const normalizedText = normalizeCompareText(item.str);
  const matrix = pdfjsLib.Util.transform(viewport.transform, item.transform);

  // Prefer the matrix's vertical scale; fall back to the scaled item height when it is 0/NaN.
  const rawHeight = Math.abs(matrix[3]) || item.height * viewport.scale;
  const height = Math.max(rawHeight, 1);

  const rect = {
    x: matrix[4],
    y: matrix[5] - height,
    width: Math.max(item.width * viewport.scale, 1),
    height,
  };

  return {
    id: `${index}-${normalizedText}`,
    text: item.str,
    normalizedText,
    rect,
    wordTokens: buildItemWordTokens(viewport, item, rect, styles),
  } satisfies CompareTextItem;
}
|
|||
|
|
|
|||
|
|
/**
 * Returns a sorted copy of `items` in reading order: top-to-bottom, then
 * left-to-right, then by id. The input array is not mutated.
 *
 * Two items count as being on the same line when their `y` values differ by
 * no more than `max(0.6 * min(height), 4)`. They count as being at the same
 * `x` when within 1 unit of each other.
 *
 * The final tie-break compares ids with numeric collation, so ids with a
 * numeric prefix such as `9-foo` and `10-bar` keep numeric order rather than
 * the character order that would put `10-…` first.
 *
 * NOTE(review): the tolerance-based comparison is not transitive for chains
 * of near-equal `y` values, so ordering in such cases may depend on the
 * engine's sort — confirm this is acceptable for callers.
 *
 * @param items Items to order.
 * @returns New array with the same items, sorted.
 */
export function sortCompareTextItems(items: CompareTextItem[]) {
  return [...items].sort((left, right) => {
    const lineTolerance = Math.max(
      Math.min(left.rect.height, right.rect.height) * 0.6,
      4
    );

    const topDiff = left.rect.y - right.rect.y;
    if (Math.abs(topDiff) > lineTolerance) {
      return topDiff;
    }

    const xDiff = left.rect.x - right.rect.x;
    if (Math.abs(xDiff) > 1) {
      return xDiff;
    }

    // Numeric collation keeps index-prefixed ids in index order.
    return left.id.localeCompare(right.id, undefined, { numeric: true });
  });
}
|
|||
|
|
|
|||
|
|
/**
 * Average width per non-whitespace character of an item: the rect width
 * divided by the count of non-whitespace characters in `normalizedText`
 * (treated as 1 when there are none).
 */
function averageCharacterWidth(item: CompareTextItem) {
  const visibleCharCount = item.normalizedText.replace(/\s+/g, '').length;
  const divisor = Math.max(visibleCharCount, 1);
  return item.rect.width / divisor;
}
|
|||
|
|
|
|||
|
|
/**
 * Decides whether a space belongs between two adjacent items on a line.
 *
 * No space is inserted when either text is empty, when `right` starts with
 * closing punctuation or a quote, when `left` ends with an opening bracket,
 * slash, quote or hyphen, or when the items touch or overlap horizontally.
 * Otherwise a space is inserted when the horizontal gap is at least
 * `max(0.45 * smaller average character width, 1.5)`.
 */
function shouldInsertSpaceBetweenItems(
  left: CompareTextItem,
  right: CompareTextItem
) {
  const leftText = left.normalizedText;
  const rightText = right.normalizedText;
  if (!leftText || !rightText) {
    return false;
  }

  const punctuationForbidsSpace =
    /^[,.;:!?%)\]}]/.test(rightText) ||
    /^[''"'’”]/u.test(rightText) ||
    /[([{/"'“‘-]$/u.test(leftText);
  if (punctuationForbidsSpace) {
    return false;
  }

  // Distance from the right edge of `left` to the left edge of `right`.
  const gap = right.rect.x - (left.rect.x + left.rect.width);
  if (gap <= 0) {
    return false;
  }

  const narrowerAverage = Math.min(
    averageCharacterWidth(left),
    averageCharacterWidth(right)
  );
  const threshold = Math.max(narrowerAverage * 0.45, 1.5);
  return gap >= threshold;
}
|
|||
|
|
|
|||
|
|
/**
 * Concatenates the normalised text of a line's items, inserting a single
 * space where `shouldInsertSpaceBetweenItems` says so, and builds a parallel
 * list of character positions.
 *
 * Each item's characters share its rect width evenly. An inserted space gets
 * the gap between the two items, with a minimum width of 1.
 *
 * NOTE(review): the returned text is normalised again after merging, and
 * `charMap` is built before that step. Alignment between the two presumably
 * relies on normalisation not changing the merged length — confirm.
 *
 * @param lineItems Items of one line, in reading order.
 * @returns Merged text and per-character positions.
 */
function mergeLineText(lineItems: CompareTextItem[]): {
  text: string;
  charMap: CharPosition[];
} {
  const charMap: CharPosition[] = [];
  if (lineItems.length === 0) {
    return { text: '', charMap };
  }

  const appendFragmentChars = (fragment: CompareTextItem) => {
    const { normalizedText, rect } = fragment;
    const perCharWidth = rect.width / Math.max(normalizedText.length, 1);
    for (let offset = 0; offset < normalizedText.length; offset++) {
      charMap.push({ x: rect.x + offset * perCharWidth, width: perCharWidth });
    }
  };

  const pieces: string[] = [];
  lineItems.forEach((current, index) => {
    if (index > 0) {
      const previous = lineItems[index - 1];
      if (shouldInsertSpaceBetweenItems(previous, current)) {
        const previousRight = previous.rect.x + previous.rect.width;
        const gap = current.rect.x - previousRight;
        charMap.push({ x: previousRight, width: Math.max(gap, 1) });
        pieces.push(' ');
      }
    }
    pieces.push(current.normalizedText);
    appendFragmentChars(current);
  });

  return { text: normalizeCompareText(pieces.join('')), charMap };
}
|
|||
|
|
|
|||
|
|
/**
 * Fuses two word tokens into one: the words and compare-words are
 * concatenated left-then-right, and the rect becomes the smallest box
 * containing both input rects. The result carries no `joinsWithPrevious`.
 */
function mergeWordTokenRects(
  left: CompareWordToken,
  right: CompareWordToken
): CompareWordToken {
  const a = left.rect;
  const b = right.rect;

  const x = Math.min(a.x, b.x);
  const y = Math.min(a.y, b.y);
  const farX = Math.max(a.x + a.width, b.x + b.width);
  const farY = Math.max(a.y + a.height, b.y + b.height);

  return {
    word: `${left.word}${right.word}`,
    compareWord: `${left.compareWord}${right.compareWord}`,
    rect: { x, y, width: farX - x, height: farY - y },
  };
}
|
|||
|
|
|
|||
|
|
/**
 * Produces line-level word tokens from the items of one line.
 *
 * Returns `undefined` when no item has word tokens. Otherwise every item
 * contributes its tokens, or one token covering the whole item when it has
 * none. A token is fused into the previously emitted one when:
 *  - it is the first token of an item and no space belongs between the
 *    previous item and this one, or
 *  - it is a later token flagged with `joinsWithPrevious`.
 */
function buildMergedWordTokens(lineItems: CompareTextItem[]) {
  const anyItemHasTokens = lineItems.some(
    (item) => item.wordTokens && item.wordTokens.length > 0
  );
  if (!anyItemHasTokens) {
    return undefined;
  }

  const mergedTokens: CompareWordToken[] = [];
  let previousItem: CompareTextItem | null = null;

  for (const item of lineItems) {
    const itemTokens: CompareWordToken[] =
      item.wordTokens && item.wordTokens.length > 0
        ? item.wordTokens
        : [
            {
              word: item.normalizedText,
              compareWord: item.normalizedText.toLowerCase(),
              rect: item.rect,
            } satisfies CompareWordToken,
          ];

    for (let tokenIndex = 0; tokenIndex < itemTokens.length; tokenIndex += 1) {
      const token = itemTokens[tokenIndex];

      let wantsJoin: boolean;
      if (tokenIndex > 0) {
        wantsJoin = Boolean(token.joinsWithPrevious);
      } else if (previousItem) {
        wantsJoin = !shouldInsertSpaceBetweenItems(previousItem, item);
      } else {
        wantsJoin = false;
      }

      const lastIndex = mergedTokens.length - 1;
      if (lastIndex >= 0 && wantsJoin) {
        mergedTokens[lastIndex] = mergeWordTokenRects(
          mergedTokens[lastIndex],
          token
        );
      } else {
        // Copy only the public fields so `joinsWithPrevious` is not carried over.
        mergedTokens.push({
          word: token.word,
          compareWord: token.compareWord,
          rect: token.rect,
        });
      }
    }

    previousItem = item;
  }

  return mergedTokens;
}
|
|||
|
|
|
|||
|
|
/**
 * Groups already-sorted items into lines and returns one merged
 * `CompareTextItem` per line.
 *
 * An item joins the current line when its `y` is within
 * `max(0.6 * min(height of line's first item, item height), 4)` of the
 * line's first item. Otherwise it starts a new line.
 *
 * Each merged item has id `line-<index>`, the raw texts joined with spaces,
 * the merged normalised text and character map, the bounding rect of its
 * fragments, the fragments themselves, and merged word tokens.
 *
 * @param sortedItems Items in reading order.
 * @returns One item per detected line; empty for empty input.
 */
export function mergeIntoLines(
  sortedItems: CompareTextItem[]
): CompareTextItem[] {
  if (sortedItems.length === 0) return [];

  const lines: CompareTextItem[][] = [];
  for (const item of sortedItems) {
    const openLine = lines[lines.length - 1];
    if (openLine !== undefined) {
      const anchor = openLine[0];
      const lineTolerance = Math.max(
        Math.min(anchor.rect.height, item.rect.height) * 0.6,
        4
      );
      if (Math.abs(item.rect.y - anchor.rect.y) <= lineTolerance) {
        openLine.push(item);
        continue;
      }
    }
    lines.push([item]);
  }

  return lines.map((lineItems, lineIndex) => {
    const { text: normalizedText, charMap } = mergeLineText(lineItems);

    // Bounding box over every fragment on the line.
    let minX = Infinity;
    let minY = Infinity;
    let maxX = -Infinity;
    let maxY = -Infinity;
    for (const { rect } of lineItems) {
      minX = Math.min(minX, rect.x);
      minY = Math.min(minY, rect.y);
      maxX = Math.max(maxX, rect.x + rect.width);
      maxY = Math.max(maxY, rect.y + rect.height);
    }

    return {
      id: `line-${lineIndex}`,
      text: lineItems.map((item) => item.text).join(' '),
      normalizedText,
      rect: {
        x: minX,
        y: minY,
        width: maxX - minX,
        height: maxY - minY,
      },
      fragments: lineItems,
      charMap,
      wordTokens: buildMergedWordTokens(lineItems),
    };
  });
}
|
|||
|
|
|
|||
|
|
/**
 * Extracts a page's text content and converts it into a `ComparePageModel`.
 *
 * Items carrying a `str` field are converted to compare items, those whose
 * normalised text is empty are dropped, and the rest are sorted into reading
 * order and merged into lines.
 *
 * NOTE(review): `disableCombineTextItems` is passed to `getTextContent` —
 * confirm the installed pdfjs-dist version still honours this option.
 *
 * @param page Page proxy to read text content from.
 * @param viewport Viewport that defines the output coordinate space and size.
 * @returns Page model with `source` set to `'pdfjs'`.
 */
export async function extractPageModel(
  page: pdfjsLib.PDFPageProxy,
  viewport: pdfjsLib.PageViewport
): Promise<ComparePageModel> {
  const textContent = await page.getTextContent({
    disableCombineTextItems: true,
  });
  const styles = textContent.styles ?? {};

  const pageTextItems = textContent.items.filter(
    (item): item is PageTextItem => 'str' in item
  );
  const compareItems = pageTextItems.map((item, index) =>
    toRect(viewport, item, index, styles)
  );
  const nonEmptyItems = compareItems.filter(
    (item) => item.normalizedText.length > 0
  );

  const textItems = mergeIntoLines(sortCompareTextItems(nonEmptyItems));

  return {
    pageNumber: page.pageNumber,
    width: viewport.width,
    height: viewport.height,
    textItems,
    plainText: joinCompareTextItems(textItems),
    hasText: textItems.length > 0,
    source: 'pdfjs',
  };
}
|