"""Reusable text-cleaning pipeline for Pokemon descriptions.

This module mirrors the notebook cleaning steps and exposes a
Streamlit-friendly API:

- no input() calls
- no print side effects
- deterministic output for a given input
"""

from __future__ import annotations

import re
import string
import unicodedata
from typing import Any, Dict, List

# Informal or misspelled words mapped to the replacement used by standardize().
SLANG_LOOKUP: Dict[str, str] = {
    "n": "and",
    "luv": "love",
    "r": "are",
    "u": "you",
    "ur": "your",
    "gonna": "going to",
    "wanna": "want to",
    "gotta": "got to",
    "pokemons": "pokemon",
    "pokmons": "pokemon",
    "bcz": "because",
}

# NLTK package ids handed to nltk.download().
_NLTK_RESOURCES = [
    "punkt",
    "punkt_tab",
    "stopwords",
    "wordnet",
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
]

# Location of each package inside an nltk_data directory; used to test for a
# local copy before attempting a download.
_NLTK_RESOURCE_PATHS: Dict[str, str] = {
    "punkt": "tokenizers/punkt",
    "punkt_tab": "tokenizers/punkt_tab",
    "stopwords": "corpora/stopwords",
    "wordnet": "corpora/wordnet",
    "averaged_perceptron_tagger": "taggers/averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng": "taggers/averaged_perceptron_tagger_eng",
}

# Built once at import time instead of on every call.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)
_NON_LETTER_RE = re.compile(r"[^a-zA-Z\s]")
_WHITESPACE_RE = re.compile(r"\s+")
_LETTER_RE = re.compile(r"[a-zA-Z]")
_DIGIT_RE = re.compile(r"[0-9]")


def _import_nltk() -> Any:
    """Import NLTK lazily so this module can be imported before deps are installed.

    Returns:
        The imported ``nltk`` module.

    Raises:
        RuntimeError: If NLTK is not installed.
    """
    try:
        import nltk  # type: ignore
    except ModuleNotFoundError as exc:
        raise RuntimeError(
            "NLTK is not installed. Install project dependencies with: pip install -r requirements.txt"
        ) from exc
    return nltk


def _nltk_resource_available(nltk: Any, resource: str) -> bool:
    """Return True when *resource* is already present in a local nltk_data dir."""
    path = _NLTK_RESOURCE_PATHS.get(resource)
    if path is None:
        # Unknown layout: report "missing" so the caller falls back to download.
        return False
    try:
        nltk.data.find(path)
    except LookupError:
        return False
    return True


def ensure_nltk_resources(quiet: bool = True) -> None:
    """Download required NLTK resources if missing.

    Safe to call at app startup (including inside Streamlit). Resources that
    are already available locally are skipped, so repeated calls do not go
    back to the network.

    Args:
        quiet: Forwarded to ``nltk.download`` to suppress its console output.

    Raises:
        RuntimeError: If NLTK is not installed, or if a download raises.
    """
    nltk = _import_nltk()
    for resource in _NLTK_RESOURCES:
        if _nltk_resource_available(nltk, resource):
            continue
        try:
            # A falsy return (e.g. offline, or an id unknown to this NLTK
            # release) is tolerated on purpose; only exceptions are fatal.
            nltk.download(resource, quiet=quiet)
        except Exception as exc:
            raise RuntimeError(f"Failed to download NLTK resource: {resource}") from exc


def remove_punctuation(text: str) -> str:
    """Return *text* with every character in ``string.punctuation`` removed."""
    return text.translate(_PUNCTUATION_TABLE)


def remove_special_chars(text: str) -> str:
    """Reduce *text* to ASCII letters separated by single spaces.

    Accented letters are decomposed first (NFKD) so their base letter is kept
    ("pokémon" -> "pokemon") instead of being dropped with the accent. Every
    remaining character that is not an ASCII letter or whitespace becomes a
    space, then whitespace runs are collapsed and the ends are trimmed.
    """
    decomposed = unicodedata.normalize("NFKD", text)
    ascii_only = decomposed.encode("ascii", "ignore").decode("ascii")
    letters_only = _NON_LETTER_RE.sub(" ", ascii_only)
    return _WHITESPACE_RE.sub(" ", letters_only).strip()


def remove_short_words(text: str, min_len: int = 3) -> str:
    """Drop whitespace-separated words shorter than *min_len* characters."""
    return " ".join(word for word in text.split() if len(word) >= min_len)


def remove_alphanum_words(text: str) -> str:
    """Drop words that contain both an ASCII letter and an ASCII digit.

    Words made only of letters or only of digits are kept.
    """
    return " ".join(
        word
        for word in text.split()
        if not (_LETTER_RE.search(word) and _DIGIT_RE.search(word))
    )


def standardize(text: str, lookup: Dict[str, str] | None = None) -> str:
    """Replace each word found in *lookup* with its mapped value.

    Args:
        text: Whitespace-separated words.
        lookup: Replacement table. ``None`` selects ``SLANG_LOOKUP``; an empty
            dict is honoured and leaves every word unchanged.

    Returns:
        The words joined by single spaces after replacement.
    """
    # Compare against None explicitly: an empty mapping is a valid request.
    mapping = SLANG_LOOKUP if lookup is None else lookup
    return " ".join(mapping.get(word, word) for word in text.split())


def _get_wordnet_pos(treebank_tag: str) -> str:
    """Map a POS tag to a WordNet POS constant by its first letter.

    Tags starting with J, V, N and R map to ADJ, VERB, NOUN and ADV
    respectively; anything else falls back to NOUN.
    """
    wordnet = _import_nltk().corpus.wordnet
    by_initial = {
        "J": wordnet.ADJ,
        "V": wordnet.VERB,
        "N": wordnet.NOUN,
        "R": wordnet.ADV,
    }
    return by_initial.get(treebank_tag[:1], wordnet.NOUN)


def clean_pokemon_text(raw_text: str, min_len: int = 3) -> Dict[str, Any]:
    """Run the full cleaning pipeline and return intermediate + final outputs.

    Returns a dictionary so a UI can display each stage if desired.

    Args:
        raw_text: The description to clean.
        min_len: Minimum word length kept by the short-word filter.

    Returns:
        A dict with the keys ``raw_text``, ``noise_removed``, ``standardized``,
        ``tokens``, ``tokens_no_stopwords``, ``lemmas`` and ``clean_text``.

    Raises:
        TypeError: If *raw_text* is not a string.
        RuntimeError: If NLTK is missing or a resource download raises.
    """
    if not isinstance(raw_text, str):
        raise TypeError("raw_text must be a string")

    nltk = _import_nltk()
    ensure_nltk_resources(quiet=True)

    # Noise removal: lowercase, strip punctuation, drop mixed letter/digit
    # words, keep ASCII letters only, then drop short words.
    text = raw_text.lower()
    text = remove_punctuation(text)
    text = remove_alphanum_words(text)
    text = remove_special_chars(text)
    noise_removed = remove_short_words(text, min_len=min_len)

    # NOTE(review): standardization runs after the short-word filter, so with
    # the default min_len=3 the one- and two-letter SLANG_LOOKUP keys
    # ("n", "r", "u", "ur") are already gone before the lookup is applied.
    # The order is kept as-is so the per-stage outputs stay unchanged.
    standardized = standardize(noise_removed)

    tokens = nltk.word_tokenize(standardized)

    stop_words = set(nltk.corpus.stopwords.words("english"))
    tokens_no_stopwords = [token for token in tokens if token not in stop_words]

    lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
    lemmas = [
        lemmatizer.lemmatize(token, _get_wordnet_pos(tag))
        for token, tag in nltk.pos_tag(tokens_no_stopwords)
    ]
    clean_text = " ".join(lemmas)

    return {
        "raw_text": raw_text,
        "noise_removed": noise_removed,
        "standardized": standardized,
        "tokens": tokens,
        "tokens_no_stopwords": tokens_no_stopwords,
        "lemmas": lemmas,
        "clean_text": clean_text,
    }


def get_clean_text(raw_text: str, min_len: int = 3) -> str:
    """Small helper for app code that only needs the final cleaned text."""
    return clean_pokemon_text(raw_text, min_len=min_len)["clean_text"]