refactor: update PDF comparison types and enhance UI for better usability

- Refactored CompareState to import from a centralized type definition. - Enhanced the compare-pdfs.html layout with improved styles for overlay and side-by-side modes. - Added new CSS styles for various UI components including panels, buttons, and highlights. - Implemented a new sidebar for displaying change summaries and filters. - Created unit tests for text comparison logic, including diffing text runs and page pairing. - Added tests for text normalization functions to ensure proper handling of punctuation and character normalization.
2026-03-08 23:55:04 +05:30
parent 86cbaf6cd3
commit 1d68691331
20 changed files with 3447 additions and 332 deletions
--- a/src/tests/compare/diff-text-runs.test.ts
+++ b/src/tests/compare/diff-text-runs.test.ts
@@ -0,0 +1,313 @@
+import { describe, expect, it } from 'vitest';
+
+import { comparePageModels } from '@/js/compare/engine/compare-page-models.ts';
+import { diffTextRuns } from '@/js/compare/engine/diff-text-runs.ts';
+import {
+  mergeIntoLines,
+  sortCompareTextItems,
+} from '@/js/compare/engine/extract-page-model.ts';
+import type { ComparePageModel, CompareTextItem } from '@/js/compare/types.ts';
+
+function makeItem(id: string, text: string): CompareTextItem {
+  return {
+    id,
+    text,
+    normalizedText: text,
+    rect: { x: 0, y: 0, width: 10, height: 10 },
+  };
+}
+
+function makePage(
+  pageNumber: number,
+  textItems: CompareTextItem[]
+): ComparePageModel {
+  return {
+    pageNumber,
+    width: 100,
+    height: 100,
+    textItems,
+    plainText: textItems.map((item) => item.normalizedText).join(' '),
+    hasText: textItems.length > 0,
+    source: 'pdfjs',
+  };
+}
+
+describe('diffTextRuns', () => {
+  it('detects modified tokens as one change', () => {
+    const result = diffTextRuns(
+      [makeItem('a', 'Hello'), makeItem('b', 'world')],
+      [makeItem('a', 'Hello'), makeItem('c', 'there')]
+    );
+
+    expect(result.summary).toEqual({ added: 0, removed: 0, modified: 1 });
+    expect(result.changes).toHaveLength(1);
+    expect(result.changes[0].type).toBe('modified');
+    expect(result.changes[0].beforeText).toBe('world');
+    expect(result.changes[0].afterText).toBe('there');
+  });
+
+  it('detects added tokens', () => {
+    const result = diffTextRuns(
+      [makeItem('a', 'Hello')],
+      [makeItem('a', 'Hello'), makeItem('b', 'again')]
+    );
+
+    expect(result.summary).toEqual({ added: 1, removed: 0, modified: 0 });
+    expect(result.changes[0].type).toBe('added');
+  });
+
+  it('splits compound replacements into discrete changes', () => {
+    const result = diffTextRuns(
+      [
+        makeItem('a', 'This'),
+        makeItem('b', 'is'),
+        makeItem('c', 'an'),
+        makeItem('d', 'example'),
+        makeItem('e', 'of'),
+        makeItem('f', 'a'),
+        makeItem('g', 'data'),
+        makeItem('h', 'table'),
+        makeItem('i', 'new.'),
+        makeItem('j', 'Disabilit'),
+      ],
+      [
+        makeItem('k', 'Example'),
+        makeItem('l', 'table'),
+        makeItem('m', 'This'),
+        makeItem('n', 'is'),
+        makeItem('o', 'an'),
+        makeItem('p', 'example'),
+        makeItem('q', 'of'),
+        makeItem('r', 'a'),
+        makeItem('s', 'data'),
+        makeItem('t', 'table.'),
+        makeItem('u', 'Disability'),
+      ]
+    );
+
+    expect(result.changes).toHaveLength(2);
+    expect(result.summary).toEqual({ added: 1, removed: 0, modified: 1 });
+    expect(
+      result.changes.some(
+        (change) =>
+          change.type === 'added' && change.afterText === 'Example table'
+      )
+    ).toBe(true);
+    expect(
+      result.changes.some(
+        (change) =>
+          change.type === 'modified' &&
+          change.beforeText === 'table new. Disabilit' &&
+          change.afterText === 'table. Disability'
+      )
+    ).toBe(true);
+  });
+});
+
+describe('comparePageModels', () => {
+  it('marks pages missing from the second document', () => {
+    const result = comparePageModels(
+      makePage(3, [makeItem('a', 'Only')]),
+      null
+    );
+
+    expect(result.status).toBe('left-only');
+    expect(result.summary.removed).toBe(1);
+    expect(result.changes[0].type).toBe('page-removed');
+  });
+});
+
+describe('sortCompareTextItems', () => {
+  it('orders tokens by reading order', () => {
+    const items: CompareTextItem[] = [
+      {
+        ...makeItem('b', 'Body'),
+        rect: { x: 60, y: 40, width: 10, height: 10 },
+      },
+      {
+        ...makeItem('a', 'Title'),
+        rect: { x: 10, y: 10, width: 10, height: 10 },
+      },
+      {
+        ...makeItem('c', 'Next'),
+        rect: { x: 10, y: 40, width: 10, height: 10 },
+      },
+    ];
+
+    expect(
+      sortCompareTextItems(items).map((item) => item.normalizedText)
+    ).toEqual(['Title', 'Next', 'Body']);
+  });
+});
+
+describe('mergeIntoLines', () => {
+  it('merges items on the same Y-line into one item', () => {
+    const items: CompareTextItem[] = [
+      {
+        id: '0',
+        text: 'Hello',
+        normalizedText: 'Hello',
+        rect: { x: 0, y: 10, width: 50, height: 12 },
+      },
+      {
+        id: '1',
+        text: 'World',
+        normalizedText: 'World',
+        rect: { x: 60, y: 10, width: 50, height: 12 },
+      },
+    ];
+    const merged = mergeIntoLines(sortCompareTextItems(items));
+
+    expect(merged).toHaveLength(1);
+    expect(merged[0].normalizedText).toBe('Hello World');
+    expect(merged[0].rect.x).toBe(0);
+    expect(merged[0].rect.width).toBe(110);
+  });
+
+  it('does not insert spaces inside a split word', () => {
+    const items: CompareTextItem[] = [
+      {
+        id: '0',
+        text: 'sam',
+        normalizedText: 'sam',
+        rect: { x: 0, y: 10, width: 24, height: 12 },
+      },
+      {
+        id: '1',
+        text: 'e',
+        normalizedText: 'e',
+        rect: { x: 24.4, y: 10, width: 8, height: 12 },
+      },
+    ];
+
+    const merged = mergeIntoLines(sortCompareTextItems(items));
+
+    expect(merged).toHaveLength(1);
+    expect(merged[0].normalizedText).toBe('same');
+  });
+
+  it('keeps items on different Y-lines separate', () => {
+    const items: CompareTextItem[] = [
+      {
+        id: '0',
+        text: 'Line 1',
+        normalizedText: 'Line 1',
+        rect: { x: 0, y: 10, width: 50, height: 12 },
+      },
+      {
+        id: '1',
+        text: 'Line 2',
+        normalizedText: 'Line 2',
+        rect: { x: 0, y: 30, width: 50, height: 12 },
+      },
+    ];
+    const merged = mergeIntoLines(sortCompareTextItems(items));
+
+    expect(merged).toHaveLength(2);
+    expect(merged[0].normalizedText).toBe('Line 1');
+    expect(merged[1].normalizedText).toBe('Line 2');
+  });
+
+  it('produces same result for different text run boundaries', () => {
+    const pdf1Items: CompareTextItem[] = [
+      {
+        id: '0',
+        text: 'Hello World',
+        normalizedText: 'Hello World',
+        rect: { x: 0, y: 10, width: 100, height: 12 },
+      },
+    ];
+    const pdf2Items: CompareTextItem[] = [
+      {
+        id: '0',
+        text: 'Hello',
+        normalizedText: 'Hello',
+        rect: { x: 0, y: 10, width: 45, height: 12 },
+      },
+      {
+        id: '1',
+        text: 'World',
+        normalizedText: 'World',
+        rect: { x: 55, y: 10, width: 45, height: 12 },
+      },
+    ];
+
+    const merged1 = mergeIntoLines(sortCompareTextItems(pdf1Items));
+    const merged2 = mergeIntoLines(sortCompareTextItems(pdf2Items));
+
+    expect(merged1[0].normalizedText).toBe(merged2[0].normalizedText);
+
+    const result = diffTextRuns(merged1, merged2);
+    expect(result.changes).toHaveLength(0);
+  });
+
+  it('detects actual changes after merging', () => {
+    const pdf1Items: CompareTextItem[] = [
+      {
+        id: '0',
+        text: 'Sample',
+        normalizedText: 'Sample',
+        rect: { x: 0, y: 10, width: 60, height: 14 },
+      },
+      {
+        id: '1',
+        text: 'page text here',
+        normalizedText: 'page text here',
+        rect: { x: 0, y: 30, width: 120, height: 14 },
+      },
+    ];
+    const pdf2Items: CompareTextItem[] = [
+      {
+        id: '0',
+        text: 'Sample',
+        normalizedText: 'Sample',
+        rect: { x: 0, y: 10, width: 45, height: 14 },
+      },
+      {
+        id: '1',
+        text: 'PDF',
+        normalizedText: 'PDF',
+        rect: { x: 55, y: 10, width: 30, height: 14 },
+      },
+      {
+        id: '2',
+        text: 'pages text here',
+        normalizedText: 'pages text here',
+        rect: { x: 0, y: 30, width: 125, height: 14 },
+      },
+    ];
+
+    const merged1 = mergeIntoLines(sortCompareTextItems(pdf1Items));
+    const merged2 = mergeIntoLines(sortCompareTextItems(pdf2Items));
+
+    expect(merged1).toHaveLength(2);
+    expect(merged2).toHaveLength(2);
+
+    const result = diffTextRuns(merged1, merged2);
+    expect(result.summary.modified).toBe(1);
+    expect(result.summary.added).toBe(0);
+    expect(result.summary.removed).toBe(0);
+    expect(result.changes).toHaveLength(1);
+    expect(result.changes[0].beforeText).toBe('page');
+    expect(result.changes[0].afterText).toBe('PDF pages');
+  });
+
+  it('preserves original casing in change descriptions', () => {
+    const result = diffTextRuns(
+      [makeItem('a', 'Sample')],
+      [makeItem('b', 'Sample PDF')]
+    );
+
+    expect(result.changes[0].afterText).toBe('PDF');
+  });
+
+  it('ignores joined versus split words when collapsed text matches', () => {
+    const result = diffTextRuns(
+      [makeItem('a', 'non'), makeItem('b', 'tincidunt')],
+      [makeItem('c', 'nontincidunt')]
+    );
+
+    expect(result.changes).toHaveLength(0);
+    expect(result.summary).toEqual({ added: 0, removed: 0, modified: 0 });
+  });
+});