77
88import json
99import logging
10+ import re
1011from typing import Any , Dict , List , Optional
1112
1213from langchain_core .messages import AIMessage , ToolMessage
@@ -205,6 +206,84 @@ def _result_to_citation(result: Dict[str, Any]) -> Optional[Dict[str, Any]]:
205206 }
206207
207208
def extract_title_from_content(content: Optional[str], max_length: int = 200) -> str:
    """
    Intelligent title extraction supporting multiple formats.

    Priority:
    1. HTML <title> tag
    2. Markdown h1 (# Title)
    3. Markdown h2-h6 (## Title, etc.)
    4. JSON/YAML title field
    5. First substantial non-empty line
    6. "Untitled" as fallback

    Args:
        content: The content to extract title from (can be None)
        max_length: Maximum title length (default: 200)

    Returns:
        Extracted title or "Untitled"
    """
    if not content:
        return "Untitled"

    # 1. Try HTML title tag.
    html_title_match = re.search(
        r'<title[^>]*>([^<]+)</title>',
        content,
        re.IGNORECASE | re.DOTALL,
    )
    if html_title_match:
        title = html_title_match.group(1).strip()
        if title:
            return title[:max_length]

    # 2./3. Try Markdown headings, preferring h1 over h2-h6.
    # (^#\s+ cannot match "## ..." because the second char must be whitespace.)
    for heading_pattern in (r'^#\s+(.+?)$', r'^#{2,6}\s+(.+?)$'):
        heading_match = re.search(heading_pattern, content, re.MULTILINE)
        if heading_match:
            title = heading_match.group(1).strip()
            if title:
                return title[:max_length]

    # 4. Try JSON/YAML title field.
    # \b prevents false positives inside longer keys such as "subtitle".
    json_title_match = re.search(
        r'\btitle"?\s*:\s*["\']?([^"\'\n]+)["\']?',
        content,
        re.IGNORECASE,
    )
    if json_title_match:
        title = json_title_match.group(1).strip()
        # Require a minimally substantial value to avoid noise matches.
        if title and len(title) > 3:
            return title[:max_length]

    # 5. First substantial non-empty line.
    for line in content.split('\n'):
        line = line.strip()
        # Skip short lines, code fences, list items, separators, and headings.
        if (line and
                len(line) > 10 and
                not line.startswith(('```', '---', '***', '- ', '* ', '+ ', '#'))):
            return line[:max_length]

    return "Untitled"
285+
286+
208287def _extract_from_crawl_result (data : Any ) -> Optional [Dict [str , Any ]]:
209288 """
210289 Extract citation from crawl tool result.
@@ -224,18 +303,8 @@ def _extract_from_crawl_result(data: Any) -> Optional[Dict[str, Any]]:
224303
225304 content = data .get ("crawled_content" , "" )
226305
227- # Try to extract title from content (first h1 or first line)
228- title = "Untitled"
229- if content :
230- lines = content .strip ().split ("\n " )
231- for line in lines :
232- line = line .strip ()
233- if line .startswith ("# " ):
234- title = line [2 :].strip ()
235- break
236- elif line and not line .startswith ("#" ):
237- title = line [:100 ]
238- break
306+ # Extract title using intelligent extraction function
307+ title = extract_title_from_content (content )
239308
240309 return {
241310 "url" : url ,
@@ -248,15 +317,48 @@ def _extract_from_crawl_result(data: Any) -> Optional[Dict[str, Any]]:
248317 }
249318
250319
251- def _extract_domain (url : str ) -> str :
252- """Extract domain from URL."""
320+ def _extract_domain (url : Optional [str ]) -> str :
321+ """
322+ Extract domain from URL using urllib with regex fallback.
323+
324+ Handles:
325+ - Standard URLs: https://www.example.com/path
326+ - Short URLs: example.com
327+ - Invalid URLs: graceful fallback
328+
329+ Args:
330+ url: The URL string to extract domain from (can be None)
331+
332+ Returns:
333+ The domain netloc (including port if present), or empty string if extraction fails
334+ """
335+ if not url :
336+ return ""
337+
338+ # Approach 1: Try urllib first (fast path for standard URLs)
253339 try :
254340 from urllib .parse import urlparse
255-
341+
256342 parsed = urlparse (url )
257- return parsed .netloc
258- except Exception :
259- return ""
343+ if parsed .netloc :
344+ return parsed .netloc
345+ except Exception as e :
346+ logger .debug (f"URL parsing failed for { url } : { e } " )
347+
348+ # Approach 2: Regex fallback (for non-standard or bare URLs without scheme)
349+ # Matches: domain[:port] where domain is a valid hostname
350+ # Pattern breakdown:
351+ # ([a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)
352+ # - domain labels separated by dots, each 1-63 chars, starting/ending with alphanumeric
353+ # (?::\d+)? - optional port
354+ pattern = r'^([a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*(?::\d+)?)(?:[/?#]|$)'
355+
356+ match = re .match (pattern , url )
357+ if match :
358+ return match .group (1 )
359+
360+ logger .warning (f"Could not extract domain from URL: { url } " )
361+ return ""
260362
261363
262364def merge_citations (
0 commit comments