77
88import json
99import logging
10+ import re
1011from typing import Any , Dict , List , Optional
1112
1213from langchain_core .messages import AIMessage , ToolMessage
@@ -205,6 +206,84 @@ def _result_to_citation(result: Dict[str, Any]) -> Optional[Dict[str, Any]]:
205206 }
206207
207208
def extract_title_from_content(content: Optional[str], max_length: int = 200) -> str:
    """
    Intelligent title extraction supporting multiple formats.

    Priority:
    1. HTML <title> tag
    2. Markdown h1 (# Title)
    3. Markdown h2-h6 (## Title, etc.)
    4. JSON/YAML title field
    5. First substantial non-empty line
    6. "Untitled" as fallback

    Args:
        content: The content to extract title from (can be None)
        max_length: Maximum title length (default: 200)

    Returns:
        Extracted title or "Untitled"
    """
    if not content:
        return "Untitled"

    # 1. Try HTML title tag.
    html_title_match = re.search(
        r'<title[^>]*>([^<]+)</title>',
        content,
        re.IGNORECASE | re.DOTALL,
    )
    if html_title_match:
        title = html_title_match.group(1).strip()
        if title:
            return title[:max_length]

    # 2./3. Try Markdown headings, preferring h1 over h2-h6.
    # (^#\s+ cannot match "## ..." because the second char must be whitespace.)
    for heading_pattern in (r'^#\s+(.+?)$', r'^#{2,6}\s+(.+?)$'):
        heading_match = re.search(heading_pattern, content, re.MULTILINE)
        if heading_match:
            title = heading_match.group(1).strip()
            if title:
                return title[:max_length]

    # 4. Try JSON/YAML title field.
    # \b prevents false positives inside longer keys such as "subtitle".
    json_title_match = re.search(
        r'\btitle"?\s*:\s*["\']?([^"\'\n]+)["\']?',
        content,
        re.IGNORECASE,
    )
    if json_title_match:
        title = json_title_match.group(1).strip()
        # Require a minimally substantial value to avoid noise matches.
        if title and len(title) > 3:
            return title[:max_length]

    # 5. First substantial non-empty line.
    for line in content.split('\n'):
        line = line.strip()
        # Skip short lines, code fences, list items, separators, and headings.
        if (line and
                len(line) > 10 and
                not line.startswith(('```', '---', '***', '- ', '* ', '+ ', '#'))):
            return line[:max_length]

    return "Untitled"
285+
286+
208287def _extract_from_crawl_result (data : Any ) -> Optional [Dict [str , Any ]]:
209288 """
210289 Extract citation from crawl tool result.
@@ -224,18 +303,8 @@ def _extract_from_crawl_result(data: Any) -> Optional[Dict[str, Any]]:
224303
225304 content = data .get ("crawled_content" , "" )
226305
227- # Try to extract title from content (first h1 or first line)
228- title = "Untitled"
229- if content :
230- lines = content .strip ().split ("\n " )
231- for line in lines :
232- line = line .strip ()
233- if line .startswith ("# " ):
234- title = line [2 :].strip ()
235- break
236- elif line and not line .startswith ("#" ):
237- title = line [:100 ]
238- break
306+ # Extract title using intelligent extraction function
307+ title = extract_title_from_content (content )
239308
240309 return {
241310 "url" : url ,
@@ -248,15 +317,48 @@ def _extract_from_crawl_result(data: Any) -> Optional[Dict[str, Any]]:
248317 }
249318
250319
251- def _extract_domain (url : str ) -> str :
252- """Extract domain from URL."""
320+ def _extract_domain (url : Optional [str ]) -> str :
321+ """
322+ Extract domain from URL using urllib with regex fallback.
323+
324+ Handles:
325+ - Standard URLs: https://www.example.com/path
326+ - Short URLs: example.com
327+ - Invalid URLs: graceful fallback
328+
329+ Args:
330+ url: The URL string to extract domain from (can be None)
331+
332+ Returns:
333+ The domain netloc (including port if present), or empty string if extraction fails
334+ """
335+ if not url :
336+ return ""
337+
338+ # Approach 1: Try urllib first (fast path for standard URLs)
253339 try :
254340 from urllib .parse import urlparse
255-
341+
256342 parsed = urlparse (url )
257- return parsed .netloc
258- except Exception :
259- return ""
343+ if parsed .netloc :
344+ return parsed .netloc
345+ except Exception as e :
346+ logger .debug (f"URL parsing failed for { url } : { e } " )
347+
348+ # Approach 2: Regex fallback (for non-standard or bare URLs without scheme)
349+ # Matches: domain[:port] where domain is a valid hostname
350+ # Pattern breakdown:
351+ # ([a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)
352+ # - domain labels separated by dots, each 1-63 chars, starting/ending with alphanumeric
353+ # (?::\d+)? - optional port
354+ pattern = r'^([a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*(?::\d+)?)(?:[/?#]|$)'
355+
356+ match = re .match (pattern , url )
357+ if match :
358+ return match .group (1 )
359+
360+ logger .warning (f"Could not extract domain from URL: { url } " )
361+ return ""
260362
261363
262364def merge_citations (
0 commit comments