@@ -109,17 +109,105 @@ function normalizeBodySectionOrder(xmlDoc) {
109109 }
110110}
111111
112+ // ── Sanitize nested paragraphs ─────────────────────────
/**
 * Flattens `w:p` elements that sit directly inside another `w:p` within a
 * table cell (`w:tc > w:p > w:p`). The original author notes that the redline
 * engine can produce this shape after redlining table content.
 *
 * Each nested paragraph is moved out of its parent paragraph and re-inserted
 * into the cell as a sibling, immediately before the paragraph that contained
 * it. Promoted paragraphs are checked again, so deeper chains
 * (`w:p > w:p > w:p`) are flattened completely rather than only one level.
 *
 * The document is mutated in place; a summary line is logged only when at
 * least one paragraph was moved.
 *
 * @param {Document} xmlDoc - Parsed document DOM to sanitize.
 * @returns {void}
 */
function sanitizeNestedParagraphs(xmlDoc) {
  const isWordParagraph = (node) =>
    node.nodeType === 1 && node.namespaceURI === NS_W && node.localName === 'p';

  let fixed = 0;
  for (const tc of Array.from(xmlDoc.getElementsByTagNameNS(NS_W, 'tc'))) {
    // Work queue seeded with the cell's direct paragraphs. Anything promoted
    // into the cell is appended so its own children get inspected as well.
    const pending = Array.from(tc.childNodes).filter(isWordParagraph);
    while (pending.length > 0) {
      const outerP = pending.shift();
      // Snapshot the children first: insertBefore() detaches innerP from
      // outerP, which would otherwise disturb a live child list mid-iteration.
      const innerParagraphs = Array.from(outerP.childNodes).filter(isWordParagraph);
      for (const innerP of innerParagraphs) {
        tc.insertBefore(innerP, outerP);
        pending.push(innerP);
        fixed++;
      }
    }
  }

  if (fixed > 0) log(`[Sanitize] Fixed ${fixed} nested w:p element(s) in table cells`);
}
140+
112141// ── Paragraph helpers ──────────────────────────────────
/**
 * Returns the concatenated text content of every element with local name `t`
 * (in any namespace) found beneath `paragraph`, in the order the DOM returns
 * them. Elements with empty or missing text content contribute nothing.
 *
 * @param {Element} paragraph - Element whose descendant text is collected.
 * @returns {string} The joined text; an empty string when nothing is found.
 */
function getParagraphText(paragraph) {
  const textElements = Array.from(paragraph.getElementsByTagNameNS('*', 't'));
  return textElements.map((node) => node.textContent || '').join('');
}
148+
/**
 * Collapses every run of whitespace in `s` to a single space and removes
 * leading and trailing whitespace.
 *
 * @param {string} s - Text to normalize.
 * @returns {string} The normalized text.
 */
function normalizeWhitespace(s) {
  // Trimming first means the split never yields empty edge tokens for
  // non-empty input; an all-whitespace or empty input yields [''] -> ''.
  const tokens = s.trim().split(/\s+/);
  return tokens.join(' ');
}
152+
/**
 * Finds the paragraph element that best corresponds to `targetText`.
 *
 * Matching strategies are tried in order and the first hit wins:
 *   1. Exact match on trimmed text.
 *   2. Match after collapsing whitespace runs.
 *   3. Target starts with the paragraph's full text (paragraph longer than
 *      10 characters) — covers targets that merged several paragraphs.
 *   4. Target contains the paragraph's full text (paragraph longer than 15
 *      characters); failing that, the paragraph contains the full target
 *      (target longer than 15 characters).
 *   5. Highest share of the target's distinct words (longer than 2 characters,
 *      case-insensitive) that also occur in the paragraph, provided the share
 *      is above 50%. Ties keep the earliest paragraph.
 *
 * Every fuzzy hit (strategies 2-5) is reported through `log`.
 *
 * @param {Document} xmlDoc - Document whose `p` elements (any namespace) are searched.
 * @param {string} targetText - Paragraph text to look for.
 * @returns {Element|null} The matching paragraph, or null when nothing
 *   qualifies or `targetText` is not a string.
 */
function findParagraphByExactText(xmlDoc, targetText) {
  // A non-string target cannot match anything; report "not found" rather than
  // throwing on .trim().
  if (typeof targetText !== 'string') return null;

  const normalizedTarget = targetText.trim();
  const normTarget = normalizeWhitespace(normalizedTarget);

  // Extract each paragraph's text once instead of once per strategy.
  const candidates = Array.from(xmlDoc.getElementsByTagNameNS('*', 'p')).map((node) => {
    const raw = getParagraphText(node);
    return { node, trimmed: raw.trim(), norm: normalizeWhitespace(raw) };
  });
  const preview = (candidate) => candidate.trimmed.slice(0, 60);

  // 1. Exact match
  const exact = candidates.find((c) => c.trimmed === normalizedTarget);
  if (exact) return exact.node;

  // 2. Whitespace-normalized match
  const normMatch = candidates.find((c) => c.norm === normTarget);
  if (normMatch) {
    log(`[Fuzzy] Whitespace-normalized match for: "${normalizedTarget.slice(0, 60)}…"`);
    return normMatch.node;
  }

  // 3. Target starts with paragraph text (the target may be several
  //    paragraphs merged together): take the first paragraph that is a prefix.
  const startsWithMatch = candidates.find(
    (c) => c.norm.length > 10 && normTarget.startsWith(c.norm)
  );
  if (startsWithMatch) {
    log(`[Fuzzy] Prefix match (target starts with paragraph): "${preview(startsWithMatch)}…"`);
    return startsWithMatch.node;
  }

  // 4. Containment in either direction. "Target contains paragraph" is tried
  //    across all paragraphs first so its precedence is unaffected by the
  //    reverse check.
  const containsMatch =
    candidates.find((c) => c.norm.length > 15 && normTarget.includes(c.norm)) ||
    candidates.find((c) => normTarget.length > 15 && c.norm.includes(normTarget));
  if (containsMatch) {
    log(`[Fuzzy] Contains match: "${preview(containsMatch)}…"`);
    return containsMatch.node;
  }

  // 5. Best overlap — share of the target's distinct words found in the
  //    paragraph. Both sides are de-duplicated so a paragraph that repeats one
  //    shared word cannot score above 100% or outrank a genuinely closer match.
  const toWordSet = (text) =>
    new Set(text.toLowerCase().split(/\s+/).filter((w) => w.length > 2));
  const targetWords = toWordSet(normTarget);
  let bestScore = 0;
  let bestCandidate = null;
  for (const candidate of candidates) {
    if (!candidate.trimmed) continue;
    let overlap = 0;
    for (const word of toWordSet(candidate.norm)) {
      if (targetWords.has(word)) overlap++;
    }
    const score = overlap / Math.max(targetWords.size, 1);
    if (score > bestScore && score > 0.5) {
      bestScore = score;
      bestCandidate = candidate;
    }
  }
  if (bestCandidate) {
    log(`[Fuzzy] Best word-overlap match (${(bestScore * 100).toFixed(0)}%): "${preview(bestCandidate)}…"`);
    return bestCandidate.node;
  }

  return null;
}
124212function createSimpleParagraph ( xmlDoc , text ) {
125213 const p = xmlDoc . createElementNS ( NS_W , 'w:p' ) ;
@@ -409,7 +497,7 @@ function buildSystemInstruction(paragraphs) {
409497 const listing = paragraphs . map ( p => `[P${ p . index } ] ${ p . text } ` ) . join ( '\n' ) ;
410498 return [
411499 'You are a contract review AI assistant. The user has uploaded a document.' ,
412- 'Below is the document content, one line per paragraph , prefixed with [P#]:' ,
500+ 'Below is the document content. Each line is ONE SEPARATE PARAGRAPH , prefixed with [P#]:' ,
413501 '' ,
414502 listing ,
415503 '' ,
@@ -424,16 +512,33 @@ function buildSystemInstruction(paragraphs) {
424512 ' { "type": "highlight", "target": "<exact paragraph text>", "textToHighlight": "<substring to highlight>", "color": "yellow|green|cyan|magenta|blue|red" }' ,
425513 ' { "type": "redline", "target": "<exact paragraph text>", "modified": "<replacement paragraph text>" }' ,
426514 '' ,
427- 'IMPORTANT RULES:' ,
428- '- "target" MUST be the EXACT full paragraph text from the listing above. Copy it verbatim.' ,
429- '- "textToComment" / "textToHighlight" must be an exact substring within that paragraph.' ,
515+ 'CRITICAL TARGETING RULES:' ,
516+ '- Each [P#] line above is a SEPARATE paragraph in the document.' ,
517+ '- "target" MUST be the EXACT text of ONE SINGLE [P#] paragraph. Copy it character-for-character.' ,
518+ '- NEVER include the [P#] prefix in ANY operation field. The [P#] prefix is only a reference label, NOT part of the actual text.' ,
519+ '- NEVER combine or concatenate text from multiple [P#] paragraphs into one target.' ,
520+ '- If you need to modify multiple paragraphs, create a SEPARATE operation for EACH paragraph.' ,
521+ '- "textToComment" / "textToHighlight" must be an exact substring found within that single paragraph.' ,
522+ '' ,
523+ 'OPERATION RULES:' ,
430524 '- Use "comment" to explain issues (best for deviations from market standards).' ,
431525 '- Use "highlight" to draw visual attention to problematic phrases.' ,
432- '- Use "redline" to suggest replacement language.' ,
526+ '- Use "redline" to suggest replacement language for a single paragraph.' ,
527+ '' ,
528+ 'FORMATTING IN REDLINES:' ,
529+ '- The "modified" field in redline operations supports special formatting syntax:' ,
530+ ' - **bold text** → wraps text in bold (use double asterisks)' ,
531+ ' - ++underline text++ → wraps text in underline (use double plus signs)' ,
532+ ' - Bullet lists: start each line with "- " for top-level bullets, " - " for nested bullets' ,
533+ ' - Tables: use markdown table syntax (e.g., "| Col1 | Col2 |\\n|---|---|\\n| val | val |")' ,
534+ '- You CAN apply formatting like bold and underline using redline operations.' ,
535+ '- To underline a title, use: { "type": "redline", "target": "Title Text", "modified": "++Title Text++" }' ,
536+ '- To bold a word, use: { "type": "redline", "target": "Some text here", "modified": "Some **text** here" }' ,
537+ '- To add NEW content before an existing paragraph, use a redline that prepends the new text before the original.' ,
433538 '- You may return an empty array [] if there are no issues.' ,
434539 '- Keep comments concise and actionable.' ,
435540 '- If the user asks about "market standards", focus on: unusual liability caps, atypical indemnification, non-standard termination, unreasonable non-compete, missing limitation of liability, unusual governing law, missing confidentiality, unusual assignment restrictions, non-standard warranty disclaimers, missing force majeure.' ,
436- '- Prefer "comment" operations for explanations and "redline" for suggest replacement language.' ,
541+ '- Prefer "comment" operations for explanations and "redline" for suggesting replacement language.' ,
437542 ] . join ( '\n' ) ;
438543}
439544
@@ -471,6 +576,13 @@ function parseGeminiChatResponse(rawText) {
471576 log ( `[WARN] Could not parse operations JSON: ${ err . message } ` ) ;
472577 }
473578
/**
 * Strips `[P#]` listing markers that Gemini sometimes copies from the
 * paragraph listing into operation fields.
 *
 * Removes one or more consecutive markers of the form `[P<number>]` or
 * `[P<number>.<number>]` from the start of the text, including any whitespace
 * before or between them, then trims the result. Markers that appear later in
 * the text are left untouched.
 *
 * @param {*} text - Value to clean.
 * @returns {*} The cleaned string; falsy or non-string input is returned unchanged.
 */
function stripParagraphMarkers(text) {
  if (!text || typeof text !== 'string') return text;
  // `^\s*` lets a marker match even when the field starts with whitespace, and
  // the `+` group consumes stacked markers such as "[P1] [P2] ...". No `g`
  // flag: an anchored pattern can match at most once.
  return text.replace(/^\s*(?:\[P\d+(?:\.\d+)?\]\s*)+/, '').trim();
}
585+
474586 // Validate and normalize each operation
475587 operations = operations . filter ( op => {
476588 if ( ! op || ! op . type || ! op . target ) return false ;
@@ -479,6 +591,11 @@ function parseGeminiChatResponse(rawText) {
479591 if ( op . type === 'redline' && ! op . modified ) return false ;
480592 return true ;
481593 } ) . map ( op => {
594+ // Strip [P#] markers from all text fields
595+ op . target = stripParagraphMarkers ( op . target ) ;
596+ if ( op . modified ) op . modified = stripParagraphMarkers ( op . modified ) ;
597+ if ( op . textToComment ) op . textToComment = stripParagraphMarkers ( op . textToComment ) ;
598+ if ( op . textToHighlight ) op . textToHighlight = stripParagraphMarkers ( op . textToHighlight ) ;
482599 if ( op . type === 'highlight' ) {
483600 const c = String ( op . color || '' ) . toLowerCase ( ) ;
484601 op . color = ALLOWED_HIGHLIGHT_COLORS . includes ( c ) ? c : 'yellow' ;
@@ -560,17 +677,24 @@ async function applyChatOperations(zip, operations, author) {
560677 }
561678 }
562679
563- // Normalize and write back
680+ // Normalize, sanitize, and write back
564681 const parser = new DOMParser ( ) ;
565682 const serializer = new XMLSerializer ( ) ;
566683 const finalDoc = parser . parseFromString ( documentXml , 'application/xml' ) ;
567684 normalizeBodySectionOrder ( finalDoc ) ;
685+ sanitizeNestedParagraphs ( finalDoc ) ;
568686 documentXml = serializer . serializeToString ( finalDoc ) ;
569687 zip . file ( 'word/document.xml' , documentXml ) ;
570688
571689 await ensureNumberingArtifacts ( zip , capturedNumberingXml ) ;
572690 for ( const cx of capturedCommentsXml ) await ensureCommentsArtifacts ( zip , cx ) ;
573- await validateOutputDocx ( zip ) ;
691+
692+ try {
693+ await validateOutputDocx ( zip ) ;
694+ } catch ( validationErr ) {
695+ log ( `[WARN] Post-operation validation: ${ validationErr . message } ` ) ;
696+ // Non-fatal — document may still be usable
697+ }
574698
575699 return results ;
576700}
0 commit comments