@@ -310,14 +310,17 @@ def seg(self, sent, standard_name=False, stopwords=None, return_sent=False):
310310 else :
311311 return result
312312
def cut_sentences(self, para, drop_empty_line=True):
    """Split a Chinese paragraph into sentences.

    Breaks are inserted after sentence-ending punctuation (。!??),
    English ellipsis "......" and Chinese ellipsis "……", except when the
    terminator is immediately followed by a closing quote (” or ’), in
    which case the break is moved to after the quote.

    :param para: paragraph text (str)
    :param drop_empty_line: if True (default), drop empty/whitespace-only
        sentences from the result
    :return: list of sentence strings
    """
    # Raw strings keep the regex escapes ('\?', '\.') literal and avoid
    # invalid-escape-sequence warnings.
    para = re.sub(r'([。!?\?])([^”’])', r'\1\n\2', para)  # single-char terminators
    para = re.sub(r'(\.{6})([^”’])', r'\1\n\2', para)  # English ellipsis "......"
    para = re.sub(r'(…{2})([^”’])', r'\1\n\2', para)  # Chinese ellipsis "……"
    # Terminator followed by a closing quote: break after the quote, so the
    # quote stays attached to the sentence it closes.
    para = re.sub(r'([。!?\?][”’])([^,。!?\?])', r'\1\n\2', para)
    para = para.rstrip()  # drop any trailing newline at paragraph end
    # Semicolons, dashes and English double quotes are deliberately ignored
    # here; adjust the rules above if they should also end sentences.
    sentences = para.split("\n")
    if drop_empty_line:
        sentences = [sent for sent in sentences if len(sent.strip()) > 0]
    return sentences
321324
322325 def clear (self ):
323326 self .deprepare ()
@@ -492,13 +495,16 @@ def search_entity(self, query, docs, inv_index):
492495 #
493496 # 文本摘要模块
494497 #
495- def get_summary (self , docs , topK = 5 , with_importance = False , standard_name = True ):
498+ def get_summary (self , docs , topK = 5 , stopwords = None , with_importance = False , standard_name = True ):
496499 import networkx as nx
497500 def sent_sim1 (words1 , words2 ):
501+ if len (words1 ) <= 1 or len (words2 ) <= 1 :
502+ return 0.0
498503 return (len (set (words1 ) & set (words2 ))) / (np .log2 (len (words1 )) + np .log2 (len (words2 )))
499504
500505 # 使用standard_name,相似度可以基于实体链接的结果计算而更加准确
501- sents = [self .seg (doc , standard_name = standard_name ) for doc in docs ]
506+ sents = [self .seg (doc .strip (), standard_name = standard_name , stopwords = stopwords ) for doc in docs ]
507+ sents = [sent for sent in sents if len (sent ) > 0 ]
502508 G = nx .Graph ()
503509 for u , v in combinations (range (len (sents )), 2 ):
504510 G .add_edge (u , v , weight = sent_sim1 (sents [u ], sents [v ]))
@@ -513,7 +519,7 @@ def sent_sim1(words1, words2):
513519 #
514520 # 实体网络模块
515521 #
516- def build_entity_graph (self , docs , inv_index = {}, used_types = []):
522+ def build_entity_graph (self , docs , min_freq = 0 , inv_index = {}, used_types = []):
517523 import networkx as nx
518524 G = nx .Graph ()
519525 links = {}
@@ -541,7 +547,8 @@ def build_entity_graph(self, docs, inv_index={}, used_types=[]):
541547 if len (ids ) > 0 :
542548 links [pair0 ] = len (ids )
543549 for (u , v ) in links :
544- G .add_edge (u , v , weight = links [(u , v )])
550+ if links [(u , v )] >= min_freq :
551+ G .add_edge (u , v , weight = links [(u , v )])
545552 self .entity_graph = G
546553 return G
547554
@@ -587,50 +594,40 @@ def build_word_ego_graph(self, docs, word, standard_name=True, min_freq=0, other
587594 G = G .subgraph (used_nodes ).copy ()
588595 return G
589596
def build_entity_ego_graph(self, docs, word, min_freq=0, other_min_freq=-1,
                           inv_index=None, used_types=None):
    """Entity-only version of build_word_ego_graph().

    Builds an ego network centered on `word`: entities that co-occur with
    `word` in at least `min_freq` documents become its neighbors; edges
    between those neighbors are kept when their co-occurrence count reaches
    `other_min_freq`.

    :param docs: iterable of document strings
    :param word: the central entity (standard name)
    :param min_freq: minimum co-occurrence count for an edge touching `word`
    :param other_min_freq: minimum count for edges between other entities;
        -1 (default) means "same as min_freq"
    :param inv_index: optional inverted index for faster document retrieval;
        when empty/None, documents are filtered via entity linking instead
    :param used_types: if non-empty, restrict nodes to entities of these types
    :return: networkx.Graph
    """
    import networkx as nx
    # Avoid the shared-mutable-default pitfall of `inv_index={}` / `used_types=[]`.
    inv_index = {} if inv_index is None else inv_index
    used_types = [] if used_types is None else used_types

    G = nx.Graph()
    links = {}
    if other_min_freq == -1:
        other_min_freq = min_freq
    if len(inv_index) != 0:  # an inverted index makes retrieval much faster
        related_docs = self.search_entity(word, docs, inv_index)
    else:
        # entity_linking returns [(span, (entity, type))]; membership must be
        # tested against the linked entity names, not the raw tuples
        # (`word in entity_linking(...)` would always be False).
        related_docs = [
            doc for doc in docs
            if word in set(entity for span, (entity, type0)
                           in self.entity_linking(doc, standard_name=True))
        ]

    for sent in related_docs:
        entities_info = self.entity_linking(sent)
        if len(used_types) == 0:
            entities = set(entity for span, (entity, type0) in entities_info)
        else:
            # stored types are delimiter-wrapped (e.g. "#type#") — presumably
            # type0[1:-1] strips those delimiters; verify against entity_linking
            entities = set(entity for span, (entity, type0) in entities_info
                           if type0[1:-1] in used_types)
        for u, v in combinations(entities, 2):
            pair0 = tuple(sorted((u, v)))
            links[pair0] = links.get(pair0, 0) + 1

    used_nodes = {word}  # the ego graph must contain the central word
    for (u, v), w in links.items():
        if word in (u, v) and w >= min_freq:
            # direct neighbor of the central word
            used_nodes.add(v if word == u else u)
            G.add_edge(u, v, weight=w)
        elif w >= other_min_freq:
            # edge between two non-central entities; survives only if both
            # endpoints turn out to be neighbors of `word`
            G.add_edge(u, v, weight=w)
    G = G.subgraph(used_nodes).copy()
    return G
0 commit comments