-
Notifications
You must be signed in to change notification settings - Fork 0
/
candidatesExtraction.py
81 lines (73 loc) · 4.66 KB
/
candidatesExtraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import spacy
from spacy.tokens import Doc
from spacy.tokens import Span
from spacy import displacy
from spacy.matcher import Matcher
class candidatesExtraction:
    """Extract keyphrase candidates from text with a spaCy pipeline.

    Two strategies are available, selected by ``regular_expression``:

    * token-pattern matching with a spaCy ``Matcher``; or
    * noun chunks, where chunks separated by exactly one token (normally the
      preposition ``'de'``) are merged into a single candidate.

    Every candidate is stored in ``self.keyphrase_candidate`` as
    ``[text, (start, end)]`` where ``start``/``end`` are token indices into
    ``self.processed_text``.
    """

    def __init__(self, regular_expression: bool, greedy: str) -> None:
        """Load the ``es_core_news_sm`` pipeline and store the settings.

        Args:
            regular_expression: When equal to ``True`` candidates come from
                the ``Matcher`` pattern; otherwise from noun chunks.
            greedy: ``"COMBINED"``, ``"NONE"`` or a value forwarded as the
                ``greedy`` argument of ``Matcher.add``.
        """
        # Important: every other method relies on this pipeline.
        self.natural_language_processor = spacy.load("es_core_news_sm")
        self.greedy = greedy
        self.regular_expression = regular_expression

    def _is_followed_by_de(self, chunk) -> bool:
        """Return True when the token right after *chunk* is ``'de'``.

        A chunk that ends at the last token of the document has no following
        token, so the answer is False instead of an IndexError.
        """
        document = self.processed_text
        return chunk.end < len(document) and document[chunk.end].text == 'de'

    def _add_candidate(self, text: str, start: int, end: int) -> None:
        """Append ``[text without leading determiner, (start, end)]``."""
        cleaned = self.clean_determinants(text)
        self.keyphrase_candidate.append([cleaned, (start, end)])

    def merge_adp_chunks(self, chunk: "Span", current_chunk_completed='') -> None:
        """Merge *chunk* with the noun chunks that follow it.

        Chunks are pulled from ``self.chunk_generator``. A following chunk is
        merged when exactly one token separates it from the current one; that
        separating token is inserted between the two texts. Merging goes on
        while the newly merged chunk is itself followed by ``'de'``.

        When the following chunk is not adjacent, the text merged so far is
        stored as a candidate and the following chunk is processed on its
        own. When no chunk is left, the text merged so far is stored as well,
        so nothing already consumed from the iterator is lost.

        Args:
            chunk: First chunk of the candidate being built.
            current_chunk_completed: Text already accumulated in front of
                *chunk* (empty for a fresh candidate).
        """
        document = self.processed_text
        if current_chunk_completed:
            # Only the text of the prefix is known, so its first token is
            # estimated from its whitespace-separated word count.
            merged_start = chunk.start - len(current_chunk_completed.split())
        else:
            merged_start = chunk.start
        merged_text = current_chunk_completed + chunk.text
        merged_end = chunk.end

        while True:
            following = next(self.chunk_generator, None)
            if following is None:
                # No more chunks: keep what was collected so far.
                self._add_candidate(merged_text, merged_start, merged_end)
                return

            if following.start != merged_end + 1:
                # Not adjacent: close the current candidate using ITS OWN
                # offsets, then handle the following chunk independently.
                self._add_candidate(merged_text, merged_start, merged_end)
                if not self._is_followed_by_de(following):
                    self._add_candidate(
                        following.text, following.start, following.end
                    )
                    return
                merged_text = following.text
                merged_start = following.start
                merged_end = following.end
                continue

            # Adjacent: join both texts with the single token between them.
            connector = document[following.start - 1].text
            merged_text = merged_text + ' ' + connector + ' ' + following.text
            merged_end = following.end
            if not self._is_followed_by_de(following):
                self._add_candidate(merged_text, merged_start, merged_end)
                return

    def clean_determinants(self, text_to_clean: str) -> str:
        """Drop the first word of *text_to_clean* when it is a determiner.

        The text is run through the pipeline; if it has more than one token
        and the first one is tagged ``DET``, everything up to and including
        the first space is removed. Empty text, single-token text and text
        without a space are returned unchanged.
        """
        processor = self.natural_language_processor(text_to_clean)
        # Length is checked first so an empty document is never indexed.
        if len(processor) > 1 and processor[0].pos_ == 'DET':
            _, separator, remainder = text_to_clean.partition(' ')
            if separator:
                return remainder
        return text_to_clean

    def apped_candidates_matches(self, matcher, doc: "Doc", i, matches):
        """``Matcher`` callback: store match number *i* as a candidate."""
        _match_id, start, end = matches[i]
        candidate_text = doc[start:end]
        self.keyphrase_candidate.append([candidate_text.text, (start, end)])

    def apply_regular_expresion(self):
        """Run the token-pattern ``Matcher`` over ``self.processed_text``.

        Candidates are collected through ``apped_candidates_matches``; the
        raw matcher result is kept in ``self.matches``.
        """
        self.matcher = Matcher(self.natural_language_processor.vocab)
        # TODO: investigate replacing the LOWER IN ["de", "con"] list with
        # every Spanish preposition.
        self.pattern = [
            {"POS": {"IN": ["ADJ", "PROPN", "NOUN"]}, "OP": "+"},
            {"LOWER": {"IN": ["de", "con"]}, "OP": "?"},
            {"POS": "DET", "OP": "?"},
            {"POS": {"IN": ["ADJ", "PROPN", "NOUN"]}, "OP": "*"},
            {"LOWER": {"IN": ["de", "con"]}, "OP": "?"},
            {"POS": "DET", "OP": "?"},
            {"POS": {"IN": ["ADJ", "PROPN", "NOUN"]}, "OP": "*"},
        ]
        callback = self.apped_candidates_matches
        if self.greedy == "COMBINED":
            # NOTE(review): both rules are registered under the same key
            # "Candidates"; confirm against the Matcher.add documentation
            # that the second `greedy` value does not replace the first.
            self.matcher.add("Candidates", [self.pattern], on_match=callback, greedy="LONGEST")
            self.matcher.add("Candidates", [self.pattern], on_match=callback, greedy="FIRST")
        elif self.greedy == "NONE":
            self.matcher.add("Candidates", [self.pattern], on_match=callback)
        else:
            # e.g. greedy="LONGEST" keeps only the longest matches.
            self.matcher.add("Candidates", [self.pattern], on_match=callback, greedy=self.greedy)
        self.matches = self.matcher(self.processed_text)

    def extract_candidates(self, input_text: str, key: str):
        """Return the keyphrase candidates found in *input_text*.

        Args:
            input_text: Text to analyse.
            key: Unused here; kept so existing callers keep working.

        Returns:
            A list of ``[text, (start, end)]`` entries (token offsets).
        """
        self.keyphrase_candidate = []
        self.processed_text = self.natural_language_processor(input_text)
        # Equality (not truthiness) is kept so values such as the string
        # "False" still select the noun-chunk strategy, as before.
        if self.regular_expression == True:  # noqa: E712
            self.apply_regular_expresion()
            return self.keyphrase_candidate

        # One shared iterator: merge_adp_chunks advances it with next(), so
        # chunks merged there are not visited again by this loop.
        self.chunk_generator = iter(self.processed_text.noun_chunks)
        for chunk in self.chunk_generator:
            if self._is_followed_by_de(chunk):
                self.merge_adp_chunks(chunk)
            else:
                # Also covers a chunk that ends at the end of the document.
                self._add_candidate(chunk.text, chunk.start, chunk.end)
        return self.keyphrase_candidate