# Juicepyter/text-cleaner/text_cleaning_pipeline.py

"""Reusable text-cleaning pipeline for Pokemon descriptions.

This module mirrors the notebook cleaning steps and exposes a Streamlit-friendly API:
- no input() calls
- no print side effects
- deterministic output for a given input
"""

from __future__ import annotations

import re
import string
import unicodedata
from typing import Any, Dict, List

# Default replacement table used by `standardize` when the caller supplies no
# lookup of its own. Each key is matched against a whole whitespace-delimited
# word (exact, case-sensitive dict lookup) and replaced by its value; words
# that are not keys pass through unchanged. Keys are lowercase because
# `clean_pokemon_text` lowercases its input before standardizing.
SLANG_LOOKUP: Dict[str, str] = {
    "n": "and",
    "luv": "love",
    "r": "are",
    "u": "you",
    "ur": "your",
    "gonna": "going to",
    "wanna": "want to",
    "gotta": "got to",
    "pokemons": "pokemon",
    "pokmons": "pokemon",
    "bcz": "because",
}

# NLTK resource identifiers that `ensure_nltk_resources` hands to
# `nltk.download`. They back the tokenizer, stop-word list, lemmatizer and
# POS tagger used by `clean_pokemon_text`.
# NOTE(review): both the plain and the `_tab` / `_eng` variants are listed,
# presumably to cover different NLTK releases -- confirm before pruning.
_NLTK_RESOURCES = [
    "punkt",
    "punkt_tab",
    "stopwords",
    "wordnet",
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
]


def _import_nltk() -> Any:
    """Import NLTK lazily so this module can be imported before deps are installed."""
    try:
        import nltk  # type: ignore
    except ModuleNotFoundError as exc:
        raise RuntimeError(
            "NLTK is not installed. Install project dependencies with: pip install -r requirements.txt"
        ) from exc
    return nltk


# Resource names already confirmed present during this process. Lets repeated
# pipeline calls skip both the filesystem probe and any network access.
_VERIFIED_NLTK_RESOURCES = set()

# nltk_data sub-directories that are probed when looking for a resource.
_NLTK_DATA_CATEGORIES = ("tokenizers", "corpora", "taggers")


def _nltk_resource_installed(nltk: Any, resource: str) -> bool:
    """Return True when ``resource`` is found in a local nltk_data directory."""
    for category in _NLTK_DATA_CATEGORIES:
        try:
            nltk.data.find(f"{category}/{resource}")
        except LookupError:
            continue
        return True
    return False


def ensure_nltk_resources(quiet: bool = True) -> None:
    """Download required NLTK resources if missing.

    Safe to call at app startup (including inside Streamlit). Resources that
    are already installed are not downloaded again, and every resource is
    checked at most once per process, so calling this on each pipeline run
    is cheap.

    Args:
        quiet: forwarded to ``nltk.download`` to suppress its progress output.

    Raises:
        RuntimeError: if NLTK is not installed, or if a missing resource
            cannot be downloaded (``nltk.download`` raised or returned False).
    """
    nltk = _import_nltk()
    for resource in _NLTK_RESOURCES:
        if resource in _VERIFIED_NLTK_RESOURCES:
            continue
        if not _nltk_resource_installed(nltk, resource):
            try:
                downloaded = nltk.download(resource, quiet=quiet)
            except Exception as exc:
                raise RuntimeError(f"Failed to download NLTK resource: {resource}") from exc
            # nltk.download reports most failures through its return value
            # rather than an exception, so a False result must not be ignored.
            if downloaded is False:
                raise RuntimeError(f"Failed to download NLTK resource: {resource}")
        _VERIFIED_NLTK_RESOURCES.add(resource)


def remove_punctuation(text: str) -> str:
    """Delete every character listed in ``string.punctuation`` from ``text``.

    Characters are removed outright (not replaced by a space), so words that
    were joined only by punctuation end up concatenated. Anything outside
    ``string.punctuation`` is left untouched.
    """
    unwanted = set(string.punctuation)
    return "".join(char for char in text if char not in unwanted)


def remove_special_chars(text: str) -> str:
    """Reduce ``text`` to ASCII letters separated by single spaces.

    Steps:
    1. Unicode-normalize (NFKD) so accented letters decompose into a base
       letter plus combining marks, and compatibility characters such as a
       non-breaking space become their plain equivalents.
    2. Drop everything that is still not ASCII.
    3. Replace every remaining character that is neither an ASCII letter nor
       whitespace with a space.
    4. Collapse whitespace runs to one space and trim both ends.

    Returns:
        The cleaned string; empty if nothing alphabetic remains.
    """
    # Without this step the ASCII encode below would delete an accented letter
    # entirely ("pok\u00e9mon" -> "pokmon") instead of keeping its base letter.
    text = unicodedata.normalize("NFKD", text)
    text = text.encode("ascii", "ignore").decode("ascii")
    text = re.sub(r"[^a-zA-Z\s]", " ", text)
    return re.sub(r"\s+", " ", text).strip()


def remove_short_words(text: str, min_len: int = 3) -> str:
    """Keep only the words of ``text`` that have at least ``min_len`` characters.

    Words are whatever ``str.split()`` yields; the survivors are re-joined
    with single spaces, so original whitespace is not preserved.
    """
    long_enough = []
    for candidate in text.split():
        if len(candidate) < min_len:
            continue
        long_enough.append(candidate)
    return " ".join(long_enough)


def remove_alphanum_words(text: str) -> str:
    """Drop words that mix ASCII letters with ASCII digits.

    A word made only of letters, or only of digits, is kept. Words are the
    pieces produced by ``str.split()`` and are re-joined with single spaces.
    """
    survivors = []
    for token in text.split():
        has_letter = re.search(r"[a-zA-Z]", token) is not None
        has_digit = re.search(r"[0-9]", token) is not None
        if has_letter and has_digit:
            continue
        survivors.append(token)
    return " ".join(survivors)


def standardize(text: str, lookup: Dict[str, str] | None = None) -> str:
    """Replace each word of ``text`` that appears as a key in ``lookup``.

    Args:
        text: input string; it is split on whitespace and matching is done on
            whole words with an exact, case-sensitive dict lookup.
        lookup: word -> replacement table. ``None`` (the default) selects the
            module-level ``SLANG_LOOKUP``. An empty dict is honoured as
            "replace nothing".

    Returns:
        The words, replaced where applicable, joined by single spaces.
    """
    # Compare against None explicitly: a plain truthiness test would silently
    # swap a caller's empty table for the default one.
    mapping = SLANG_LOOKUP if lookup is None else lookup
    return " ".join(mapping.get(word, word) for word in text.split())


def _get_wordnet_pos(treebank_tag: str) -> str:
    """Translate a POS tag into the matching WordNet part-of-speech constant.

    Only the first character of ``treebank_tag`` matters: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Any other tag (including an
    empty one) falls back to noun.
    """
    nltk = _import_nltk()
    wordnet = nltk.corpus.wordnet
    # Map the tag's leading letter to the name of the wordnet attribute.
    constant_by_prefix = {
        "J": "ADJ",
        "V": "VERB",
        "N": "NOUN",
        "R": "ADV",
    }
    constant_name = constant_by_prefix.get(treebank_tag[:1], "NOUN")
    return getattr(wordnet, constant_name)


def clean_pokemon_text(raw_text: str, min_len: int = 3) -> Dict[str, Any]:
    """Clean a description end to end and expose every intermediate stage.

    Stages, in order: lowercase -> strip punctuation -> drop mixed
    letter/digit words -> strip special characters -> drop words shorter than
    ``min_len`` -> slang standardization -> tokenization -> English stop-word
    removal -> POS-aware lemmatization.

    Args:
        raw_text: the text to clean.
        min_len: minimum word length kept by the short-word filter.

    Returns:
        A dict with the keys ``raw_text``, ``noise_removed``, ``standardized``,
        ``tokens``, ``tokens_no_stopwords``, ``lemmas`` and ``clean_text`` so a
        UI can show each stage.

    Raises:
        TypeError: if ``raw_text`` is not a ``str``.
        RuntimeError: propagated from the NLTK import / resource helpers.
    """
    if not isinstance(raw_text, str):
        raise TypeError("raw_text must be a string")

    nltk = _import_nltk()
    ensure_nltk_resources(quiet=True)

    # --- Noise removal -----------------------------------------------------
    lowered = raw_text.lower()
    without_punctuation = remove_punctuation(lowered)
    without_mixed_words = remove_alphanum_words(without_punctuation)
    letters_only = remove_special_chars(without_mixed_words)
    noise_removed = remove_short_words(letters_only, min_len=min_len)

    # --- Standardization and tokenization ------------------------------------
    standardized = standardize(noise_removed)
    tokens = nltk.word_tokenize(standardized)

    # --- Stop-word filtering -------------------------------------------------
    english_stopwords = set(nltk.corpus.stopwords.words("english"))
    tokens_no_stopwords = [word for word in tokens if word not in english_stopwords]

    # --- Lemmatization, guided by each token's POS tag -----------------------
    lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
    lemmas = []
    for word, tag in nltk.pos_tag(tokens_no_stopwords):
        lemmas.append(lemmatizer.lemmatize(word, _get_wordnet_pos(tag)))

    return {
        "raw_text": raw_text,
        "noise_removed": noise_removed,
        "standardized": standardized,
        "tokens": tokens,
        "tokens_no_stopwords": tokens_no_stopwords,
        "lemmas": lemmas,
        "clean_text": " ".join(lemmas),
    }


def get_clean_text(raw_text: str, min_len: int = 3) -> str:
    """Return only the final cleaned string produced by ``clean_pokemon_text``.

    Convenience wrapper for callers that have no use for the intermediate
    stages; arguments are passed through unchanged.
    """
    stages = clean_pokemon_text(raw_text, min_len=min_len)
    return stages["clean_text"]