167 lines
4.8 KiB
Python
167 lines
4.8 KiB
Python
import unittest
|
|
|
|
from keyword_extractor import KeywordExtractor
|
|
|
|
|
|
class FakeToken:
|
|
def __init__(self, text: str, pos: str, lemma: str, is_stop: bool) -> None:
|
|
self.text = text
|
|
self.pos_ = pos
|
|
self.lemma_ = lemma
|
|
self.is_stop = is_stop
|
|
self.is_punct = not any(ch.isalnum() for ch in text)
|
|
|
|
|
|
class FakeNLP:
|
|
def __init__(self, tag_map, stopwords) -> None:
|
|
self.tag_map = tag_map
|
|
self.stopwords = stopwords
|
|
|
|
def __call__(self, text: str):
|
|
tokens = []
|
|
for raw in text.split():
|
|
token_text = raw.strip()
|
|
lowered = token_text.lower()
|
|
tokens.append(
|
|
FakeToken(
|
|
text=token_text,
|
|
pos=self.tag_map.get(lowered, "NOUN"),
|
|
lemma=lowered,
|
|
is_stop=lowered in self.stopwords,
|
|
)
|
|
)
|
|
return tokens
|
|
|
|
|
|
class TestableKeywordExtractor(KeywordExtractor):
|
|
def __init__(self, *args, yake_scores=None, **kwargs):
|
|
super().__init__(*args, **kwargs)
|
|
self._test_yake_scores = yake_scores or {}
|
|
|
|
def _extract_yake_scores(self, text: str):
|
|
return self._test_yake_scores
|
|
|
|
|
|
class KeywordExtractorTests(unittest.TestCase):
|
|
@classmethod
|
|
def setUpClass(cls) -> None:
|
|
tag_map = {
|
|
"fiery": "ADJ",
|
|
"dragon": "NOUN",
|
|
"attack": "VERB",
|
|
"explosive": "ADJ",
|
|
"flames": "NOUN",
|
|
"burning": "ADJ",
|
|
"creature": "NOUN",
|
|
"with": "ADP",
|
|
"blaze": "NOUN",
|
|
"and": "CCONJ",
|
|
"dangerous": "ADJ",
|
|
"electric": "ADJ",
|
|
"mouse": "NOUN",
|
|
"using": "VERB",
|
|
"thunder": "NOUN",
|
|
"shock": "NOUN",
|
|
"strong": "ADJ",
|
|
"furret": "NOUN",
|
|
"long": "ADJ",
|
|
"slender": "ADJ",
|
|
"soft": "ADJ",
|
|
"fur": "NOUN",
|
|
"flexible": "ADJ",
|
|
"body": "NOUN",
|
|
"move": "VERB",
|
|
"gracefully": "ADJ",
|
|
"narrow": "ADJ",
|
|
"tunnel": "NOUN",
|
|
"tail": "NOUN",
|
|
"smash": "VERB",
|
|
"opponent": "NOUN",
|
|
"battle": "NOUN",
|
|
"cheerful": "ADJ",
|
|
"endurance": "NOUN",
|
|
}
|
|
|
|
stopwords = {
|
|
"a",
|
|
"very",
|
|
"and",
|
|
"with",
|
|
"the",
|
|
"it",
|
|
"to",
|
|
"its",
|
|
"that",
|
|
"through",
|
|
"in",
|
|
}
|
|
cls.nlp = FakeNLP(tag_map=tag_map, stopwords=stopwords)
|
|
cls.extractor = KeywordExtractor(nlp=cls.nlp, use_yake=False)
|
|
|
|
def test_readme_main_example(self) -> None:
|
|
text = "fiery dragon attack explosive flames"
|
|
result = self.extractor.extract(text)
|
|
self.assertEqual(result, ["fire", "dragon", "attack", "explosion"])
|
|
|
|
def test_synonym_normalization(self) -> None:
|
|
text = "burning creature with blaze power"
|
|
result = self.extractor.extract(text)
|
|
self.assertEqual(result, ["fire", "creature", "power"])
|
|
|
|
def test_mixed_types(self) -> None:
|
|
text = "electric mouse using thunder shock"
|
|
result = self.extractor.extract(text)
|
|
self.assertEqual(result, ["electric", "mouse", "using"])
|
|
|
|
def test_noise_input(self) -> None:
|
|
text = "a very very strong and dangerous creature"
|
|
result = self.extractor.extract(text)
|
|
self.assertEqual(result, ["strong", "dangerous", "creature"])
|
|
|
|
def test_yake_keeps_detailed_information(self) -> None:
|
|
text = (
|
|
"furret long slender creature soft fur flexible body move gracefully narrow tunnel "
|
|
"tail smash opponent battle cheerful endurance"
|
|
)
|
|
|
|
yake_scores = {
|
|
"furret": 0.00,
|
|
"creature": 0.05,
|
|
"tail": 0.08,
|
|
"battle": 0.10,
|
|
"smash": 0.12,
|
|
"tunnel": 0.14,
|
|
"endurance": 0.18,
|
|
"body": 0.20,
|
|
"cheerful": 0.22,
|
|
"slender": 0.26,
|
|
"flexible": 0.28,
|
|
"gracefully": 0.34,
|
|
"narrow": 0.40,
|
|
"long": 0.42,
|
|
"soft": 0.44,
|
|
"fur": 0.45,
|
|
"move": 0.48,
|
|
"opponent": 0.52,
|
|
}
|
|
extractor = TestableKeywordExtractor(
|
|
nlp=self.nlp,
|
|
use_yake=True,
|
|
keep_ratio=0.8,
|
|
min_keywords=10,
|
|
max_keywords=30,
|
|
yake_scores=yake_scores,
|
|
)
|
|
|
|
result = extractor.extract(text)
|
|
|
|
self.assertGreaterEqual(len(result), 10)
|
|
self.assertIn("furret", result)
|
|
self.assertIn("creature", result)
|
|
self.assertIn("tail", result)
|
|
self.assertIn("tunnel", result)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
unittest.main()
|