first commit

2026-03-19 18:16:20 +01:00
commit 584b2e07b4
34 changed files with 4381 additions and 0 deletions
--- a/text-cleaner/.ipynb_checkpoints/text_cleaning_pipeline-checkpoint.py
+++ b/text-cleaner/.ipynb_checkpoints/text_cleaning_pipeline-checkpoint.py
@@ -0,0 +1,158 @@
+"""Reusable text-cleaning pipeline for Pokemon descriptions.
+
+This module mirrors the notebook cleaning steps and exposes a Streamlit-friendly API:
+- no input() calls
+- no print side effects
+- deterministic output for a given input
+"""
+
+from __future__ import annotations
+
+import re
+import string
+from typing import Any, Dict, List
+
+# Default replacement table used by standardize(): each key is compared
+# against a whole whitespace-separated word (exact, case-sensitive match) and
+# swapped for its value. clean_pokemon_text() lowercases the text first, so
+# the keys are written in lowercase.
+# NOTE(review): in clean_pokemon_text() remove_short_words() runs before
+# standardize(), so with the default min_len=3 the keys shorter than three
+# characters ("n", "r", "u", "ur") never reach this table there; they only
+# apply when standardize() is called directly or with a smaller min_len.
+SLANG_LOOKUP: Dict[str, str] = {
+    "n": "and",
+    "luv": "love",
+    "r": "are",
+    "u": "you",
+    "ur": "your",
+    "gonna": "going to",
+    "wanna": "want to",
+    "gotta": "got to",
+    "pokemons": "pokemon",
+    # presumably "pokémons" after the accented letter has been stripped by
+    # remove_special_chars() - confirm
+    "pokmons": "pokemon",
+    "bcz": "because",
+}
+
+# NLTK package identifiers that ensure_nltk_resources() walks through and
+# hands to nltk.download().
+# NOTE(review): presumably these back word_tokenize (punkt*), the stopword
+# list, the WordNet lemmatizer and pos_tag (averaged_perceptron_tagger*), with
+# both the older and the newer identifier listed to cover different NLTK
+# releases - confirm against the NLTK versions this project supports.
+_NLTK_RESOURCES = [
+    "punkt",
+    "punkt_tab",
+    "stopwords",
+    "wordnet",
+    "averaged_perceptron_tagger",
+    "averaged_perceptron_tagger_eng",
+]
+
+
+def _import_nltk() -> Any:
+    """Return the ``nltk`` package, importing it only when first needed.
+
+    The import happens at call time rather than at module import time, so
+    this module can be imported even when NLTK is not installed yet.
+
+    Returns:
+        The imported ``nltk`` module object.
+
+    Raises:
+        RuntimeError: If NLTK cannot be found; the original
+            ``ModuleNotFoundError`` is attached as the cause.
+    """
+    try:
+        import nltk as nltk_module  # type: ignore
+    except ModuleNotFoundError as missing:
+        message = "NLTK is not installed. Install project dependencies with: pip install -r requirements.txt"
+        raise RuntimeError(message) from missing
+    else:
+        return nltk_module
+
+
+# Location of each resource below an NLTK data directory. Used to probe the
+# local installation with nltk.data.find() before going to the network.
+# Identifiers without an entry here are always passed to nltk.download().
+_NLTK_RESOURCE_PATHS: Dict[str, str] = {
+    "punkt": "tokenizers/punkt",
+    "punkt_tab": "tokenizers/punkt_tab",
+    "stopwords": "corpora/stopwords",
+    "wordnet": "corpora/wordnet",
+    "averaged_perceptron_tagger": "taggers/averaged_perceptron_tagger",
+    "averaged_perceptron_tagger_eng": "taggers/averaged_perceptron_tagger_eng",
+}
+
+# True once every resource has been confirmed present in this process, so
+# repeated calls (clean_pokemon_text() calls this on every request) are free.
+_nltk_resources_ready = False
+
+
+def ensure_nltk_resources(quiet: bool = True) -> None:
+    """Download required NLTK resources if missing.
+
+    Safe to call at app startup (including inside Streamlit) and cheap to call
+    repeatedly: resources already present on disk are detected locally and
+    skipped, and once everything has been confirmed the result is remembered
+    for the lifetime of the process.
+
+    Args:
+        quiet: Forwarded to ``nltk.download`` to suppress its console output.
+
+    Raises:
+        RuntimeError: If NLTK is not installed, or if ``nltk.download`` raises
+            for a resource (the original exception is chained as the cause).
+    """
+    global _nltk_resources_ready
+    if _nltk_resources_ready:
+        return
+
+    nltk = _import_nltk()
+    everything_available = True
+    for resource in _NLTK_RESOURCES:
+        local_path = _NLTK_RESOURCE_PATHS.get(resource)
+        if local_path is not None:
+            try:
+                nltk.data.find(local_path)
+            except LookupError:
+                pass  # not installed locally -> fall through to the download
+            else:
+                continue
+
+        try:
+            downloaded = nltk.download(resource, quiet=quiet)
+        except Exception as exc:
+            raise RuntimeError(f"Failed to download NLTK resource: {resource}") from exc
+
+        # nltk.download() reports most failures (offline, identifier unknown to
+        # the installed NLTK release) by returning False instead of raising.
+        # Keep that best-effort behaviour, but do not cache the result so a
+        # later call can try again.
+        if not downloaded:
+            everything_available = False
+
+    _nltk_resources_ready = everything_available
+
+
+def remove_punctuation(text: str) -> str:
+    """Return *text* with every character of ``string.punctuation`` deleted.
+
+    Characters are removed outright (not replaced by a space), so words that
+    were joined by punctuation end up fused, e.g. ``"fire-type"`` becomes
+    ``"firetype"``. Only the ASCII punctuation set is affected.
+    """
+    return text.translate(str.maketrans("", "", string.punctuation))
+
+
+def remove_special_chars(text: str) -> str:
+    """Reduce *text* to ASCII letters separated by single spaces.
+
+    Steps:
+    1. Decompose accented characters (NFKD) so that dropping the non-ASCII
+       part keeps the base letter: ``"pokémon"`` becomes ``"pokemon"``
+       instead of losing the letter and turning into ``"pokmon"``.
+    2. Replace everything that is not an ASCII letter or whitespace with a
+       space.
+    3. Collapse whitespace runs and trim both ends.
+
+    Args:
+        text: Input string; may be empty.
+
+    Returns:
+        The cleaned string (empty if nothing but special characters remain).
+    """
+    # Local import keeps this function self-contained without touching the
+    # module's import block.
+    import unicodedata
+
+    decomposed = unicodedata.normalize("NFKD", text)
+    ascii_only = decomposed.encode("ascii", "ignore").decode("ascii")
+    letters_and_spaces = re.sub(r"[^a-zA-Z\s]", " ", ascii_only)
+    return re.sub(r"\s+", " ", letters_and_spaces).strip()
+
+
+def remove_short_words(text: str, min_len: int = 3) -> str:
+    """Drop every whitespace-separated word shorter than *min_len* characters.
+
+    The surviving words are re-joined with single spaces, so the original
+    spacing is not preserved.
+    """
+    long_enough = [token for token in text.split() if not len(token) < min_len]
+    return " ".join(long_enough)
+
+
+def remove_alphanum_words(text: str) -> str:
+    """Remove words that contain both an ASCII letter and an ASCII digit.
+
+    Purely alphabetic and purely numeric words are kept. The result is
+    re-joined with single spaces.
+    """
+
+    def mixes_letters_and_digits(token: str) -> bool:
+        has_letter = re.search(r"[a-zA-Z]", token) is not None
+        has_digit = re.search(r"[0-9]", token) is not None
+        return has_letter and has_digit
+
+    return " ".join(
+        token for token in text.split() if not mixes_letters_and_digits(token)
+    )
+
+
+def standardize(text: str, lookup: Dict[str, str] | None = None) -> str:
+    """Replace each word of *text* that has an entry in *lookup*.
+
+    Words are the whitespace-separated pieces of *text*; matching is exact and
+    case-sensitive. The result is re-joined with single spaces.
+
+    Args:
+        text: Input string.
+        lookup: Replacement table. ``None`` (the default) selects
+            ``SLANG_LOOKUP``. An explicitly passed empty dict is honoured and
+            leaves every word unchanged.
+
+    Returns:
+        The text with known words replaced.
+    """
+    # Compare against None rather than relying on truthiness: ``lookup or ...``
+    # would silently swap an intentionally empty table for the default one.
+    mapping = SLANG_LOOKUP if lookup is None else lookup
+    return " ".join(mapping.get(word, word) for word in text.split())
+
+
+def _get_wordnet_pos(treebank_tag: str) -> str:
+    """Translate a POS tag (as produced by ``nltk.pos_tag``) to a WordNet POS.
+
+    Only the first character of the tag is inspected: ``J`` -> adjective,
+    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Any other tag, including
+    an empty one, falls back to noun.
+    """
+    wordnet = _import_nltk().corpus.wordnet
+    constant_by_initial = {"J": "ADJ", "V": "VERB", "N": "NOUN", "R": "ADV"}
+    constant_name = constant_by_initial.get(treebank_tag[:1], "NOUN")
+    return getattr(wordnet, constant_name)
+
+
+def clean_pokemon_text(raw_text: str, min_len: int = 3) -> Dict[str, Any]:
+    """Clean a description and report the result of every pipeline stage.
+
+    Stages, in order: lowercase -> strip punctuation -> drop mixed
+    letter/digit words -> keep ASCII letters only -> drop words shorter than
+    *min_len* -> slang standardisation -> tokenisation -> English stopword
+    removal -> POS-aware WordNet lemmatisation.
+
+    Args:
+        raw_text: The text to clean.
+        min_len: Minimum word length kept by the short-word filter.
+
+    Returns:
+        A dict with the keys ``raw_text``, ``noise_removed``,
+        ``standardized``, ``tokens``, ``tokens_no_stopwords``, ``lemmas`` and
+        ``clean_text`` (the lemmas joined by single spaces), so a UI can show
+        each stage.
+
+    Raises:
+        TypeError: If *raw_text* is not a ``str``.
+        RuntimeError: If NLTK is not installed or a resource download raises.
+    """
+    if not isinstance(raw_text, str):
+        raise TypeError("raw_text must be a string")
+
+    nltk = _import_nltk()
+    tokenize = nltk.word_tokenize
+    tag_tokens = nltk.pos_tag
+    stopword_corpus = nltk.corpus.stopwords
+    lemmatizer_cls = nltk.stem.wordnet.WordNetLemmatizer
+
+    ensure_nltk_resources(quiet=True)
+
+    # Noise removal: each step feeds the next one.
+    lowered = raw_text.lower()
+    without_punctuation = remove_punctuation(lowered)
+    without_mixed_words = remove_alphanum_words(without_punctuation)
+    letters_only = remove_special_chars(without_mixed_words)
+    noise_removed = remove_short_words(letters_only, min_len=min_len)
+
+    standardized = standardize(noise_removed)
+    tokens = tokenize(standardized)
+
+    english_stopwords = set(stopword_corpus.words("english"))
+    tokens_no_stopwords = [
+        candidate for candidate in tokens if candidate not in english_stopwords
+    ]
+
+    # Lemmatise with the POS of each token so verbs/adjectives are reduced
+    # correctly instead of all being treated as nouns.
+    lemmatizer = lemmatizer_cls()
+    lemmas = []
+    for word, treebank_tag in tag_tokens(tokens_no_stopwords):
+        lemmas.append(lemmatizer.lemmatize(word, _get_wordnet_pos(treebank_tag)))
+
+    stages: Dict[str, Any] = {
+        "raw_text": raw_text,
+        "noise_removed": noise_removed,
+        "standardized": standardized,
+        "tokens": tokens,
+        "tokens_no_stopwords": tokens_no_stopwords,
+        "lemmas": lemmas,
+        "clean_text": " ".join(lemmas),
+    }
+    return stages
+
+
+def get_clean_text(raw_text: str, min_len: int = 3) -> str:
+    """Return only the final ``clean_text`` produced by clean_pokemon_text().
+
+    Convenience wrapper for callers that do not need the intermediate stages.
+    """
+    stages = clean_pokemon_text(raw_text, min_len=min_len)
+    return stages["clean_text"]