diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/__tests__/cellCountUtils.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/__tests__/cellCountUtils.ts new file mode 100644 index 00000000..4911e4e8 --- /dev/null +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/__tests__/cellCountUtils.ts @@ -0,0 +1,26 @@ +import { NotebookPair } from '../../types/common'; + +/** + * Asserts that source and codex notebooks have matching cell counts + */ +export function assertMatchingCellCounts(pair: NotebookPair): void { + const count = (notebook: typeof pair.source) => { + const parts = { topLevel: 0, paratext: 0, child: 0 }; + for (const cell of notebook.cells) { + const isParatext = cell.metadata?.type === 'paratext'; + const isChild = cell.metadata?.isChild === true || cell.id.split(':').length > 2; + if (isParatext) parts.paratext++; + else if (isChild) parts.child++; + else parts.topLevel++; + } + return parts; + }; + + const s = count(pair.source); + const c = count(pair.codex); + + if (s.topLevel !== c.topLevel || s.paratext !== c.paratext || s.child !== c.child || pair.source.cells.length !== pair.codex.cells.length) { + throw new Error(`Cell count mismatch: topLevel ${s.topLevel}/${c.topLevel}, paratext ${s.paratext}/${c.paratext}, child ${s.child}/${c.child}, total ${pair.source.cells.length}/${pair.codex.cells.length}`); + } +} + diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/__tests__/subtitlesCellCount.test.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/__tests__/subtitlesCellCount.test.ts new file mode 100644 index 00000000..5da4ab27 --- /dev/null +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/__tests__/subtitlesCellCount.test.ts @@ -0,0 +1,47 @@ +import { describe, it, expect } from 'vitest'; +import { subtitlesImporter } from '../subtitles/index'; +import { assertMatchingCellCounts } from './cellCountUtils'; + +class MockFile { + readonly lastModified: number; + readonly name: string; + readonly size: number; + readonly type: string; + readonly webkitRelativePath: string = ''; + private readonly _content: string; + arrayBuffer(): Promise { throw new Error('Not implemented'); } + slice(): Blob { throw new Error('Not implemented'); } + stream(): ReadableStream { throw new Error('Not implemented'); } + constructor(content: string, name: string, type = 'text/vtt') { + this._content = content; + this.name = name; + this.type = type; + this.size = content.length; + this.lastModified = Date.now(); + } + text(): Promise { return Promise.resolve(this._content); } + [Symbol.toStringTag]: string = 'File'; +} + +describe('Subtitles Importer - Cell Count Consistency', () => { + it('should produce matching cell counts when same file imported twice', async () => { + const vtt = `WEBVTT + +00:00:30.697 --> 00:00:31.990 +The first time? + +00:00:32.783 --> 00:00:34.785 +You know the first time.`; + + const file = new MockFile(vtt, 'test.vtt') as unknown as File; + const result1 = await subtitlesImporter.parseFile(file); + const result2 = await subtitlesImporter.parseFile(file); + + expect(result1.success && result2.success).toBe(true); + assertMatchingCellCounts(result1.notebookPair!); + assertMatchingCellCounts(result2.notebookPair!); + expect(result1.notebookPair!.source.cells.length).toBe(result2.notebookPair!.source.cells.length); + expect(result1.notebookPair!.source.cells[0].id).toBe(result2.notebookPair!.source.cells[0].id); + }); +}); + diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/__tests__/usfmCellCount.test.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/__tests__/usfmCellCount.test.ts new file mode 100644 index 00000000..c783e55d --- /dev/null +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/__tests__/usfmCellCount.test.ts @@ -0,0 +1,44 @@ +import { describe, it, expect } from 'vitest'; +import { parseFile } from '../usfm/index'; +import { assertMatchingCellCounts } from './cellCountUtils'; + +class MockFile { + readonly lastModified: number; + readonly name: string; + readonly size: number; + readonly type: string; + readonly webkitRelativePath: string = ''; + private readonly _content: string; + arrayBuffer(): Promise { throw new Error('Not implemented'); } + slice(): Blob { throw new Error('Not implemented'); } + stream(): ReadableStream { throw new Error('Not implemented'); } + constructor(content: string, name: string, type = 'text/plain') { + this._content = content; + this.name = name; + this.type = type; + this.size = content.length; + this.lastModified = Date.now(); + } + text(): Promise { return Promise.resolve(this._content); } + [Symbol.toStringTag]: string = 'File'; +} + +describe('USFM Importer - Cell Count Consistency', () => { + it('should produce matching cell counts when same file imported twice', async () => { + const usfm = `\\id GEN +\\c 1 +\\v 1 In the beginning God created the heavens and the earth. +\\p +\\v 2 Now the earth was formless and empty.`; + + const file = new MockFile(usfm, 'GEN.usfm') as unknown as File; + const result1 = await parseFile(file); + const result2 = await parseFile(file); + + expect(result1.success && result2.success).toBe(true); + assertMatchingCellCounts(result1.notebookPair!); + assertMatchingCellCounts(result2.notebookPair!); + expect(result1.notebookPair!.source.cells.length).toBe(result2.notebookPair!.source.cells.length); + }); +}); + diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/common/usfmUtils.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/common/usfmUtils.ts index 66a14c4f..066bc36d 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/common/usfmUtils.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/common/usfmUtils.ts @@ -18,6 +18,19 @@ import { validateFootnotes } from '../../utils/footnoteUtils'; // Deprecated: dynamic import of usfm-grammar. Replaced by lightweight regex parser. export const initializeUsfmGrammar = async () => { }; +/** + * Simple hash function for deterministic ID generation + */ +const simpleHash = (str: string): string => { + let hash = 0; + for (let i = 0; i < str.length; i++) { + const char = str.charCodeAt(i); + hash = ((hash << 5) - hash) + char; + hash = hash & hash; + } + return Math.abs(hash).toString(36).slice(0, 8); +}; + export interface UsfmContent { id: string; content: string; @@ -132,6 +145,7 @@ export const processUsfmContent = async ( chapters.add(chapterNumber); let seenFirstVerseInChapter = false; + let paratextIndex = 0; chapter.contents.forEach((content: any) => { if (content.verseNumber !== undefined && content.verseText !== undefined) { // This is a verse - process it for footnotes @@ -158,6 +172,7 @@ export const processUsfmContent = async ( }); // Create child cells for milestone spans (e.g., qt, ts) + let childIndex = 0; try { const milestoneTags = new Set(['qt', 'ts']); const parser = new DOMParser(); @@ -169,8 +184,9 @@ export const processUsfmContent = async ( const el = node as HTMLElement; const tag = el.getAttribute('data-tag'); if (tag && milestoneTags.has(tag)) { - const childId = `${verseId}:${Math.random().toString(36).slice(2, 11)}`; const innerHtml = el.innerHTML; + const childId = `${verseId}:child-${childIndex}`; + childIndex++; usfmContent.push({ id: childId, content: innerHtml, @@ -193,15 +209,17 @@ export const processUsfmContent = async ( }; Array.from(container.childNodes).forEach(walker); } - } catch { - // ignore child extraction errors + } catch (error) { + console.warn('Error creating child cells:', error); } seenFirstVerseInChapter = true; } else if (content.text && !content.marker) { // This is paratext (content without specific markers) - const paratextId = `${bookCode} ${chapterNumber}:${Math.random().toString(36).slice(2, 10)}`; const paratextContent = content.text.trim(); + const contentHash = simpleHash(paratextContent); + const paratextId = `${bookCode} ${chapterNumber}:paratext-${paratextIndex}-${contentHash}`; + paratextIndex++; const htmlParatext = paratextContent.length > 0 ? `

${convertUsfmInlineMarkersToHtml(paratextContent)}

` : paratextContent; // Convert USFM to HTML with footnotes if needed @@ -234,8 +252,10 @@ export const processUsfmContent = async ( }); } else if (content.marker) { // Preserve raw marker lines as paratext for round-trip fidelity - const paratextId = `${bookCode} ${chapterNumber}:${Math.random().toString(36).slice(2, 10)}`; const markerLine = String(content.marker).trim(); + const contentHash = simpleHash(markerLine); + const paratextId = `${bookCode} ${chapterNumber}:paratext-${paratextIndex}-${contentHash}`; + paratextIndex++; const htmlBlock = usfmBlockToHtml(markerLine); // Determine if style-only (no inner text) let cellType: UsfmContent['type'] = 'text'; @@ -281,6 +301,8 @@ export const processUsfmContent = async ( cellLabel: item.metadata.verse !== undefined ? item.metadata.verse?.toString() : undefined, originalText: item.metadata.originalText, fileName: item.metadata.fileName, + isChild: item.metadata.isChild, + parentId: item.metadata.parentId, }); }); @@ -381,7 +403,9 @@ export const createNotebookPair = ( id: sourceCell.id, content: isStyleCell ? sourceCell.content : '', images: sourceCell.images, - metadata: sourceCell.metadata, + metadata: { + ...sourceCell.metadata, + }, }; });