
Commit 9d871ed

Author: blmoistawinde

V0.4.1
Some bugs fixed. Resources adjusted.

1 parent: 0c66813

5 files changed: 46 additions & 44 deletions


examples/basics.py

Lines changed: 5 additions & 0 deletions
@@ -200,6 +200,11 @@ def build_word_ego_graph():
     nx.draw(G,pos)
     nx.draw_networkx_labels(G,pos)
     plt.show()
+    G = ht0.build_entity_ego_graph(docs, "刘备", min_freq=3, other_min_freq=2)
+    pos = nx.kamada_kawai_layout(G)
+    nx.draw(G, pos)
+    nx.draw_networkx_labels(G, pos)
+    plt.show()
 
 def using_typed_words():
     from harvesttext.resources import get_qh_typed_words,get_baidu_stopwords
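The new example lines assume that ht0 and docs were prepared earlier in basics.py. For readers trying the snippet in isolation, a minimal sketch of that setup could look as follows; it assumes the Sanguo resource helpers get_sanguo and get_sanguo_entity_dict from harvesttext.resources, and the real example file may prepare docs slightly differently.

import networkx as nx
import matplotlib.pyplot as plt
from harvesttext import HarvestText
from harvesttext.resources import get_sanguo, get_sanguo_entity_dict

# Register the Sanguo entity dictionary so entity linking can find "刘备"
entity_mention_dict, entity_type_dict = get_sanguo_entity_dict()
ht0 = HarvestText()
ht0.add_entities(entity_mention_dict, entity_type_dict)

# Use the sentences of the first chapter as documents
docs = ht0.cut_sentences(get_sanguo()[0])

# Ego graph centered on "刘备": edges to the center need frequency >= 3,
# edges among its neighbors only need frequency >= 2
G = ht0.build_entity_ego_graph(docs, "刘备", min_freq=3, other_min_freq=2)
pos = nx.kamada_kawai_layout(G)
nx.draw(G, pos)
nx.draw_networkx_labels(G, pos)
plt.show()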

harvesttext/harvesttext.py

Lines changed: 38 additions & 41 deletions
@@ -310,14 +310,17 @@ def seg(self, sent, standard_name=False, stopwords=None, return_sent=False):
         else:
             return result
 
-    def cut_sentences(self, para): # sentence splitting
-        para = re.sub('([。!?\?])([^”])', r"\1\n\2", para) # single-character sentence-ending punctuation
-        para = re.sub('(\.{6})([^”])', r"\1\n\2", para) # English ellipsis
-        para = re.sub('(\…{2})([^”])', r"\1\n\2", para) # Chinese ellipsis
-        para = re.sub('(”)', '”\n', para) # put the sentence-break \n after the closing quote; the rules above deliberately keep the quotes
+    def cut_sentences(self, para, drop_empty_line = True): # sentence splitting
+        para = re.sub('([。!?\?])([^”])', r"\1\n\2", para) # single-character sentence-ending punctuation
+        para = re.sub('(\.{6})([^”])', r"\1\n\2", para) # English ellipsis
+        para = re.sub('(\…{2})([^”])', r"\1\n\2", para) # Chinese ellipsis
+        para = re.sub('([。!?\?][”’])([^,。!?\?])', r'\1\n\2', para) # put the sentence-break \n after the closing quote; the rules above deliberately keep the quotes
         para = para.rstrip() # strip any extra trailing \n at the end of the paragraph
         # many rule sets also handle the semicolon ;, but it is ignored here, as are dashes and English double quotes; adjust if needed.
-        return para.split("\n")
+        sentences = para.split("\n")
+        if drop_empty_line:
+            sentences = [sent for sent in sentences if len(sent.strip()) > 0]
+        return sentences
 
     def clear(self):
         self.deprepare()
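A quick sketch of what the new drop_empty_line flag changes (assuming a plain HarvestText instance; the exact split depends on the rules above):

from harvesttext import HarvestText

ht = HarvestText()
para = "第一句。第二句!\n\n第三句?"
# Consecutive break markers used to produce empty strings in the result;
# with the new default they are filtered out.
print(ht.cut_sentences(para))                         # e.g. ['第一句。', '第二句!', '第三句?']
print(ht.cut_sentences(para, drop_empty_line=False))  # keeps the empty lines, matching the old behaviour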
@@ -492,13 +495,16 @@ def search_entity(self, query, docs, inv_index):
     #
     # Text summarization module
     #
-    def get_summary(self, docs, topK=5, with_importance=False, standard_name=True):
+    def get_summary(self, docs, topK=5, stopwords=None, with_importance=False, standard_name=True):
         import networkx as nx
         def sent_sim1(words1, words2):
+            if len(words1) <= 1 or len(words2) <= 1:
+                return 0.0
             return (len(set(words1) & set(words2))) / (np.log2(len(words1)) + np.log2(len(words2)))
 
         # with standard_name, similarity can be computed on the entity-linking results and is therefore more accurate
-        sents = [self.seg(doc, standard_name=standard_name) for doc in docs]
+        sents = [self.seg(doc.strip(), standard_name=standard_name, stopwords=stopwords) for doc in docs]
+        sents = [sent for sent in sents if len(sent) > 0]
         G = nx.Graph()
         for u, v in combinations(range(len(sents)), 2):
             G.add_edge(u, v, weight=sent_sim1(sents[u], sents[v]))
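The guard added to sent_sim1 matters because the denominator uses log2 of the sentence lengths: for a one-token sentence log2(1) is 0, so two such sentences would divide by zero (and empty segmentations, now more likely when stopwords are stripped, would be worse). A standalone sketch of the patched similarity, with illustrative inputs:

import numpy as np

def sent_sim1(words1, words2):
    # Degenerate sentences (0 or 1 token) get zero similarity instead of
    # hitting a zero or undefined denominator below.
    if len(words1) <= 1 or len(words2) <= 1:
        return 0.0
    return len(set(words1) & set(words2)) / (np.log2(len(words1)) + np.log2(len(words2)))

print(sent_sim1(["今天", "天气", "很", "好"], ["天气", "不错"]))  # 1 shared word / (log2(4) + log2(2)) = 1/3
print(sent_sim1(["好"], ["好"]))                                  # 0.0, previously a zero denominator

The new stopwords argument is simply forwarded to seg(), and documents whose segmentation ends up empty are dropped before the sentence graph is built.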
@@ -513,7 +519,7 @@ def sent_sim1(words1, words2):
     #
     # Entity network module
     #
-    def build_entity_graph(self, docs, inv_index={}, used_types=[]):
+    def build_entity_graph(self, docs, min_freq=0, inv_index={}, used_types=[]):
         import networkx as nx
         G = nx.Graph()
         links = {}
@@ -541,7 +547,8 @@ def build_entity_graph(self, docs, inv_index={}, used_types=[]):
                 if len(ids) > 0:
                     links[pair0] = len(ids)
         for (u, v) in links:
-            G.add_edge(u, v, weight=links[(u, v)])
+            if links[(u, v)] >= min_freq:
+                G.add_edge(u, v, weight=links[(u, v)])
         self.entity_graph = G
         return G
 
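With the new min_freq, build_entity_graph only keeps edges between entities whose co-occurrence count reaches the threshold. A hypothetical toy usage, assuming entities registered via add_entities and that entity linking picks them up in these short documents:

from harvesttext import HarvestText

ht = HarvestText()
# Hypothetical toy entity dictionary and corpus, only to illustrate min_freq
entity_mention_dict = {"曹操": ["曹操"], "刘备": ["刘备"], "孙权": ["孙权"]}
entity_type_dict = {"曹操": "人名", "刘备": "人名", "孙权": "人名"}
ht.add_entities(entity_mention_dict, entity_type_dict)

docs = ["曹操与刘备会面。", "曹操大败刘备。", "孙权与刘备结盟。"]
# Keep only entity pairs that co-occur in at least 2 documents
G = ht.build_entity_graph(docs, min_freq=2)
print(list(G.edges(data=True)))  # expected: only the 曹操-刘备 edge, with weight 2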

@@ -587,50 +594,40 @@ def build_word_ego_graph(self, docs, word, standard_name=True, min_freq=0, other
         G = G.subgraph(used_nodes).copy()
         return G
 
-    def build_entity_ego_graph(self, docs, word, min_freq=0, inv_index={}, used_types=[]):
+    def build_entity_ego_graph(self, docs, word, min_freq=0, other_min_freq=-1, inv_index={}, used_types=[]):
         '''
         Entity only version of build_word_ego_graph()
-        :param docs:
-        :param word:
-        :param min_freq:
-        :param inv_index:
-        :param used_types:
-        :return:
-
         '''
         import networkx as nx
         G = nx.Graph()
         links = {}
-        if len(inv_index) == 0:
+        if other_min_freq == -1:
+            other_min_freq = min_freq
+        if len(inv_index) != 0:
             related_docs = self.search_entity(word, docs, inv_index)
-            for i, sent in enumerate(related_docs):
-                entities_info = self.entity_linking(sent)
-                if len(used_types) == 0:
-                    entities = set(entity for span, (entity, type0) in entities_info)
-                else:
-                    entities = set(entity for span, (entity, type0) in entities_info if type0[1:-1] in used_types)
-                for u, v in combinations(entities, 2):
-                    pair0 = tuple(sorted((u, v)))
-                    if pair0 not in links:
-                        links[pair0] = 1
-                    else:
-                        links[pair0] += 1
-        else: # an inverted index is available, so retrieval is faster
+        else:
+            related_docs = [doc for doc in docs if word in self.entity_linking(doc,standard_name=True)]
+
+        for i, sent in enumerate(related_docs):
+            entities_info = self.entity_linking(sent)
             if len(used_types) == 0:
-                entities = self.entity_type_dict.keys()
+                entities = set(entity for span, (entity, type0) in entities_info)
             else:
-                entities = iter(entity for (entity, type0) in self.entity_type_dict.items() if type0 in used_types)
+                entities = set(entity for span, (entity, type0) in entities_info if type0[1:-1] in used_types)
             for u, v in combinations(entities, 2):
-                if u != word and v != word: # since the graph is centered on the given word, every pair must include it
-                    continue
                 pair0 = tuple(sorted((u, v)))
-                ids = inv_index[u] & inv_index[v]
-                if len(ids) > 0:
-                    links[pair0] = len(ids)
+                if pair0 not in links:
+                    links[pair0] = 1
+                else:
+                    links[pair0] += 1
+
         used_nodes = set([word]) # words in a relation pair must be related to the center entity (>= min_freq)
         for (u, v) in links:
-            if word in (u, v) and links[(u, v)] >= min_freq:
+            w = links[(u, v)]
+            if word in (u, v) and w >= min_freq:
                 used_nodes.add(v if word == u else u)
-                G.add_edge(u, v, weight=links[(u, v)])
+                G.add_edge(u, v, weight=w)
+            elif w >= other_min_freq:
+                G.add_edge(u, v, weight=w)
         G = G.subgraph(used_nodes).copy()
         return G
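The net effect of min_freq versus the new other_min_freq: edges that touch the center word must reach min_freq, while edges between two of its already admitted neighbors only need other_min_freq, which defaults to min_freq. A toy sketch of just that filtering step, independent of the HarvestText class and using hypothetical co-occurrence counts:

import networkx as nx

def filter_ego_edges(links, word, min_freq=0, other_min_freq=-1):
    # Mirrors the filtering loop of the patched build_entity_ego_graph
    if other_min_freq == -1:
        other_min_freq = min_freq
    G = nx.Graph()
    used_nodes = {word}
    for (u, v), w in links.items():
        if word in (u, v) and w >= min_freq:
            used_nodes.add(v if word == u else u)
            G.add_edge(u, v, weight=w)
        elif w >= other_min_freq:
            G.add_edge(u, v, weight=w)
    # Nodes only enter the ego graph through a strong enough edge to the center
    return G.subgraph(used_nodes).copy()

links = {("刘备", "关羽"): 5, ("刘备", "张飞"): 4, ("关羽", "张飞"): 2, ("关羽", "曹操"): 3}
G = filter_ego_edges(links, "刘备", min_freq=3, other_min_freq=2)
print(list(G.edges(data=True)))
# 刘备-关羽 and 刘备-张飞 pass min_freq; the weaker 关羽-张飞 edge survives only because
# other_min_freq=2; 曹操 is dropped because it has no sufficiently frequent edge to the center word.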

harvesttext/resources/sanguo_entity_dict.json

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.
