Juicepyter/clean-text-to-keywords/test_keyword_extractor.py

import unittest

from keyword_extractor import KeywordExtractor


class FakeToken:
    def __init__(self, text: str, pos: str, lemma: str, is_stop: bool) -> None:
        self.text = text
        self.pos_ = pos
        self.lemma_ = lemma
        self.is_stop = is_stop
        self.is_punct = not any(ch.isalnum() for ch in text)


class FakeNLP:
    def __init__(self, tag_map, stopwords) -> None:
        self.tag_map = tag_map
        self.stopwords = stopwords

    def __call__(self, text: str):
        tokens = []
        for raw in text.split():
            token_text = raw.strip()
            lowered = token_text.lower()
            tokens.append(
                FakeToken(
                    text=token_text,
                    pos=self.tag_map.get(lowered, "NOUN"),
                    lemma=lowered,
                    is_stop=lowered in self.stopwords,
                )
            )
        return tokens


class TestableKeywordExtractor(KeywordExtractor):
    def __init__(self, *args, yake_scores=None, **kwargs):
        super().__init__(*args, **kwargs)
        self._test_yake_scores = yake_scores or {}

    def _extract_yake_scores(self, text: str):
        return self._test_yake_scores


class KeywordExtractorTests(unittest.TestCase):
    @classmethod
    def setUpClass(cls) -> None:
        tag_map = {
            "fiery": "ADJ",
            "dragon": "NOUN",
            "attack": "VERB",
            "explosive": "ADJ",
            "flames": "NOUN",
            "burning": "ADJ",
            "creature": "NOUN",
            "with": "ADP",
            "blaze": "NOUN",
            "and": "CCONJ",
            "dangerous": "ADJ",
            "electric": "ADJ",
            "mouse": "NOUN",
            "using": "VERB",
            "thunder": "NOUN",
            "shock": "NOUN",
            "strong": "ADJ",
            "furret": "NOUN",
            "long": "ADJ",
            "slender": "ADJ",
            "soft": "ADJ",
            "fur": "NOUN",
            "flexible": "ADJ",
            "body": "NOUN",
            "move": "VERB",
            "gracefully": "ADJ",
            "narrow": "ADJ",
            "tunnel": "NOUN",
            "tail": "NOUN",
            "smash": "VERB",
            "opponent": "NOUN",
            "battle": "NOUN",
            "cheerful": "ADJ",
            "endurance": "NOUN",
        }

        stopwords = {
            "a",
            "very",
            "and",
            "with",
            "the",
            "it",
            "to",
            "its",
            "that",
            "through",
            "in",
        }
        cls.nlp = FakeNLP(tag_map=tag_map, stopwords=stopwords)
        cls.extractor = KeywordExtractor(nlp=cls.nlp, use_yake=False)

    def test_readme_main_example(self) -> None:
        text = "fiery dragon attack explosive flames"
        result = self.extractor.extract(text)
        self.assertEqual(result, ["fire", "dragon", "attack", "explosion"])

    def test_synonym_normalization(self) -> None:
        text = "burning creature with blaze power"
        result = self.extractor.extract(text)
        self.assertEqual(result, ["fire", "creature", "power"])

    def test_mixed_types(self) -> None:
        text = "electric mouse using thunder shock"
        result = self.extractor.extract(text)
        self.assertEqual(result, ["electric", "mouse", "using"])

    def test_noise_input(self) -> None:
        text = "a very very strong and dangerous creature"
        result = self.extractor.extract(text)
        self.assertEqual(result, ["strong", "dangerous", "creature"])

    def test_yake_keeps_detailed_information(self) -> None:
        text = (
            "furret long slender creature soft fur flexible body move gracefully narrow tunnel "
            "tail smash opponent battle cheerful endurance"
        )

        yake_scores = {
            "furret": 0.00,
            "creature": 0.05,
            "tail": 0.08,
            "battle": 0.10,
            "smash": 0.12,
            "tunnel": 0.14,
            "endurance": 0.18,
            "body": 0.20,
            "cheerful": 0.22,
            "slender": 0.26,
            "flexible": 0.28,
            "gracefully": 0.34,
            "narrow": 0.40,
            "long": 0.42,
            "soft": 0.44,
            "fur": 0.45,
            "move": 0.48,
            "opponent": 0.52,
        }
        extractor = TestableKeywordExtractor(
            nlp=self.nlp,
            use_yake=True,
            keep_ratio=0.8,
            min_keywords=10,
            max_keywords=30,
            yake_scores=yake_scores,
        )

        result = extractor.extract(text)

        self.assertGreaterEqual(len(result), 10)
        self.assertIn("furret", result)
        self.assertIn("creature", result)
        self.assertIn("tail", result)
        self.assertIn("tunnel", result)


if __name__ == "__main__":
    unittest.main()