159 lines
4.4 KiB
Python
159 lines
4.4 KiB
Python
"""Reusable text-cleaning pipeline for Pokemon descriptions.

This module mirrors the notebook cleaning steps and exposes a Streamlit-friendly API:

- no input() calls
- no print side effects
- deterministic output for a given input
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
import string
|
|
from typing import Any, Dict, List
|
|
|
|
# Default replacement table used by ``standardize``: each key is matched
# against a whole whitespace-separated word and swapped for its value.
SLANG_LOOKUP: Dict[str, str] = {
    # Single-letter / texting shorthand.
    "n": "and",
    "luv": "love",
    "r": "are",
    "u": "you",
    "ur": "your",
    # Informal contractions.
    "gonna": "going to",
    "wanna": "want to",
    "gotta": "got to",
    # Domain-specific spellings.
    # NOTE(review): "pokmons" is presumably what an accented spelling looks
    # like once non-ASCII characters have been dropped -- confirm.
    "pokemons": "pokemon",
    "pokmons": "pokemon",
    "bcz": "because",
}
|
|
|
|
_NLTK_RESOURCES = [
|
|
"punkt",
|
|
"punkt_tab",
|
|
"stopwords",
|
|
"wordnet",
|
|
"averaged_perceptron_tagger",
|
|
"averaged_perceptron_tagger_eng",
|
|
]
|
|
|
|
|
|
def _import_nltk() -> Any:
|
|
"""Import NLTK lazily so this module can be imported before deps are installed."""
|
|
try:
|
|
import nltk # type: ignore
|
|
except ModuleNotFoundError as exc:
|
|
raise RuntimeError(
|
|
"NLTK is not installed. Install project dependencies with: pip install -r requirements.txt"
|
|
) from exc
|
|
return nltk
|
|
|
|
|
|
def ensure_nltk_resources(quiet: bool = True) -> None:
    """Download required NLTK resources if missing.

    Each name in ``_NLTK_RESOURCES`` is first looked up in the local NLTK
    data directories; ``nltk.download`` is only invoked for names that are
    not found. This keeps repeated calls cheap and lets the pipeline run
    without network access once the data is installed.

    Safe to call at app startup (including inside Streamlit).

    Args:
        quiet: forwarded to ``nltk.download``; when true the downloader does
            not print progress output.

    Raises:
        RuntimeError: if NLTK is not installed, or if a missing resource
            could not be downloaded.
    """
    nltk = _import_nltk()

    # Top-level folders of an NLTK data directory under which the packages
    # listed in _NLTK_RESOURCES are stored.
    categories = ("tokenizers", "corpora", "taggers")

    def _is_installed(resource: str) -> bool:
        """Return True when *resource* is found under any known category."""
        for category in categories:
            try:
                nltk.data.find(f"{category}/{resource}")
            except LookupError:
                continue
            return True
        return False

    for resource in _NLTK_RESOURCES:
        if _is_installed(resource):
            continue
        try:
            downloaded = nltk.download(resource, quiet=quiet)
        except Exception as exc:
            raise RuntimeError(f"Failed to download NLTK resource: {resource}") from exc
        # nltk.download signals most failures (e.g. no network) by returning
        # a falsy value instead of raising, so the result must be checked.
        if not downloaded:
            raise RuntimeError(f"Failed to download NLTK resource: {resource}")
|
|
|
|
|
|
def remove_punctuation(text: str) -> str:
    """Return *text* with every ``string.punctuation`` character deleted.

    Characters are removed outright (not replaced by a space), so words
    joined by punctuation are fused together.
    """
    # Mapping a code point to None tells str.translate to drop it.
    deletions = {ord(symbol): None for symbol in string.punctuation}
    return text.translate(deletions)
|
|
|
|
|
|
def remove_special_chars(text: str) -> str:
    """Reduce *text* to ASCII letters separated by single spaces.

    Non-ASCII characters are dropped, every remaining character that is
    neither an ASCII letter nor whitespace becomes a space, and runs of
    whitespace are collapsed. Leading/trailing whitespace is stripped.
    """
    ascii_only = text.encode("ascii", "ignore").decode("ascii")
    letters_only = re.sub(r"[^a-zA-Z\s]", " ", ascii_only)
    single_spaced = re.sub(r"\s+", " ", letters_only)
    return single_spaced.strip()
|
|
|
|
|
|
def remove_short_words(text: str, min_len: int = 3) -> str:
    """Drop whitespace-separated words shorter than *min_len* characters.

    The surviving words are re-joined with single spaces.
    """
    survivors = []
    for token in text.split():
        if len(token) < min_len:
            continue
        survivors.append(token)
    return " ".join(survivors)
|
|
|
|
|
|
def remove_alphanum_words(text: str) -> str:
    """Drop words that mix ASCII letters with ASCII digits.

    Words made only of letters, or only of digits, are kept. The surviving
    words are re-joined with single spaces.
    """
    kept = []
    for token in text.split():
        has_letter = re.search(r"[a-zA-Z]", token) is not None
        has_digit = re.search(r"[0-9]", token) is not None
        if has_letter and has_digit:
            continue
        kept.append(token)
    return " ".join(kept)
|
|
|
|
|
|
def standardize(text: str, lookup: Dict[str, str] | None = None) -> str:
    """Replace whole words of *text* using a lookup table.

    Args:
        text: input string; it is split on whitespace.
        lookup: mapping from word to replacement. When ``None`` (the
            default) ``SLANG_LOOKUP`` is used. An explicitly supplied
            mapping is always honoured, including an empty one, which
            results in no replacements.

    Returns:
        The words of *text*, each replaced by its lookup value when present,
        joined with single spaces.
    """
    # Compare against None rather than relying on truthiness: an empty dict
    # is a legitimate "replace nothing" table and must not fall back to the
    # default.
    mapping = SLANG_LOOKUP if lookup is None else lookup
    return " ".join(mapping.get(word, word) for word in text.split())
|
|
|
|
|
|
def _get_wordnet_pos(treebank_tag: str) -> str:
    """Map a POS tag to the matching WordNet part-of-speech constant.

    The decision is made on the tag's first letter: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Any other tag (including
    an empty string) falls back to noun.
    """
    nltk = _import_nltk()
    wordnet = nltk.corpus.wordnet

    # First letter of the tag -> name of the constant on the wordnet corpus.
    constant_by_prefix = {
        "J": "ADJ",
        "V": "VERB",
        "N": "NOUN",
        "R": "ADV",
    }
    constant_name = constant_by_prefix.get(treebank_tag[:1], "NOUN")
    return getattr(wordnet, constant_name)
|
|
|
|
|
|
def clean_pokemon_text(raw_text: str, min_len: int = 3) -> Dict[str, Any]:
    """Run the full cleaning pipeline and return intermediate + final outputs.

    Stages, in order: lowercase, strip punctuation, drop mixed
    letter/digit words, strip non-letter characters, drop short words,
    slang standardization, tokenization, English stopword removal and
    POS-aware WordNet lemmatization.

    Args:
        raw_text: the text to clean.
        min_len: minimum word length kept by the short-word filter.

    Returns:
        A dictionary with the keys ``raw_text``, ``noise_removed``,
        ``standardized``, ``tokens``, ``tokens_no_stopwords``, ``lemmas``
        and ``clean_text`` so a UI can display each stage if desired.

    Raises:
        TypeError: if *raw_text* is not a string.
    """
    if not isinstance(raw_text, str):
        raise TypeError("raw_text must be a string")

    nltk = _import_nltk()
    tag_tokens = nltk.pos_tag
    tokenize = nltk.word_tokenize
    stopword_corpus = nltk.corpus.stopwords
    lemmatizer_cls = nltk.stem.wordnet.WordNetLemmatizer

    ensure_nltk_resources(quiet=True)

    # Noise removal: the single-argument cleaners run in a fixed order,
    # followed by the length filter.
    # NOTE(review): the length filter runs before standardize(), so lookup
    # keys shorter than min_len never reach the slang table -- confirm this
    # ordering is intended.
    noise_removed = raw_text.lower()
    for cleaner in (remove_punctuation, remove_alphanum_words, remove_special_chars):
        noise_removed = cleaner(noise_removed)
    noise_removed = remove_short_words(noise_removed, min_len=min_len)

    standardized = standardize(noise_removed)
    tokens = tokenize(standardized)

    english_stopwords = set(stopword_corpus.words("english"))
    tokens_no_stopwords = [
        candidate for candidate in tokens if candidate not in english_stopwords
    ]

    lemmatizer = lemmatizer_cls()
    tagged_tokens = tag_tokens(tokens_no_stopwords)
    lemmas = [
        lemmatizer.lemmatize(word, _get_wordnet_pos(treebank_tag))
        for word, treebank_tag in tagged_tokens
    ]

    stages: Dict[str, Any] = {
        "raw_text": raw_text,
        "noise_removed": noise_removed,
        "standardized": standardized,
        "tokens": tokens,
        "tokens_no_stopwords": tokens_no_stopwords,
        "lemmas": lemmas,
        "clean_text": " ".join(lemmas),
    }
    return stages
|
|
|
|
|
|
def get_clean_text(raw_text: str, min_len: int = 3) -> str:
    """Return only the final ``clean_text`` stage of the cleaning pipeline.

    Convenience wrapper around ``clean_pokemon_text`` for app code that
    does not need the intermediate stages.
    """
    stages = clean_pokemon_text(raw_text, min_len=min_len)
    return stages["clean_text"]
|