first commit
This commit is contained in:
@@ -0,0 +1,298 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# 🎴 Génération de Carte Pokémon depuis un Texte Descriptif\n",
|
||||
"## Partie 1 — Nettoyage du Texte (NLU Pipeline)\n",
|
||||
"\n",
|
||||
"On prend un texte descriptif fourni par l'utilisateur et on le nettoie étape par étape.\n",
|
||||
"\n",
|
||||
"```\n",
|
||||
"Texte brut → Noise Removal → Tokenization → Stopwords → Lemmatization → Texte propre\n",
|
||||
"```"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"## 📦 Installation des dépendances"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "",
|
||||
"evalue": "",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[1;31mRunning cells with 'Python 3.12.3' requires the ipykernel package.\n",
|
||||
"\u001b[1;31m<a href='command:jupyter.createPythonEnvAndSelectController'>Create a Python Environment</a> with the required packages.\n",
|
||||
"\u001b[1;31mOr install 'ipykernel' using the command: '/usr/bin/python3 -m pip install ipykernel -U --user --force-reinstall'"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"!pip install nltk --quiet\n",
|
||||
"\n",
|
||||
"import nltk\n",
|
||||
"nltk.download('punkt', quiet=True)\n",
|
||||
"nltk.download('punkt_tab', quiet=True)\n",
|
||||
"nltk.download('stopwords', quiet=True)\n",
|
||||
"nltk.download('wordnet', quiet=True)\n",
|
||||
"nltk.download('averaged_perceptron_tagger', quiet=True)\n",
|
||||
"nltk.download('averaged_perceptron_tagger_eng', quiet=True)\n",
|
||||
"\n",
|
||||
"print(\"✅ Dépendances installées !\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"## 📝 Saisie du texte utilisateur"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"raw_text = \"\"\"\n",
|
||||
"This is a HUGE fire dragon!!! It has got massive red wings and shoots \n",
|
||||
"powerfull flames from its mouth... It's super fast n really strong!!\n",
|
||||
"Its body is coverd with shiny golden scales & it lives in volcanos.\n",
|
||||
"it luv to fight other pokémons and is very very aggressive >:(\n",
|
||||
"\"\"\"\n",
|
||||
"\n",
|
||||
"print(\"📄 Texte brut :\")\n",
|
||||
"print(raw_text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"## 🧹 Étape 1 — Noise Removal\n",
|
||||
"\n",
|
||||
"On supprime la ponctuation, les caractères spéciaux, les mots trop courts, et on met tout en minuscules.\n",
|
||||
"\n",
|
||||
"> 📖 *Cours page 25-29 — `removePunctuation`, `removeShortWords`, `removePattern`*"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import re\n",
|
||||
"import string\n",
|
||||
"\n",
|
||||
"def remove_punctuation(text):\n",
|
||||
" \"\"\"Supprime la ponctuation du texte.\"\"\"\n",
|
||||
" mapping_table = text.maketrans('', '', string.punctuation)\n",
|
||||
" return text.translate(mapping_table)\n",
|
||||
"\n",
|
||||
"def remove_special_chars(text):\n",
|
||||
" \"\"\"Supprime les caractères non-ASCII (emojis, accents parasites...).\"\"\"\n",
|
||||
" text = text.encode('ascii', 'ignore').decode('ascii')\n",
|
||||
" text = re.sub(r'[^a-zA-Z\\s]', ' ', text)\n",
|
||||
" return re.sub(r'\\s+', ' ', text).strip()\n",
|
||||
"\n",
|
||||
"def remove_short_words(text, min_len=3):\n",
|
||||
" \"\"\"Supprime les mots de moins de min_len caractères.\"\"\"\n",
|
||||
" return \" \".join([word for word in text.split() if len(word) >= min_len])\n",
|
||||
"\n",
|
||||
"# Application\n",
|
||||
"text = raw_text.lower() # minuscules\n",
|
||||
"text = remove_punctuation(text) # ponctuation\n",
|
||||
"text = remove_special_chars(text) # caractères spéciaux\n",
|
||||
"text = remove_short_words(text) # mots trop courts\n",
|
||||
"\n",
|
||||
"print(\"🔇 Après Noise Removal :\")\n",
|
||||
"print(text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"## 📖 Étape 2 — Object Standardization\n",
|
||||
"\n",
|
||||
"On remplace les abréviations et l'argot par leurs formes standard.\n",
|
||||
"\n",
|
||||
"> 📖 *Cours page 38 — lookup table `standardize`*"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"SLANG_LOOKUP = {\n",
|
||||
" \"n\": \"and\",\n",
|
||||
" \"luv\": \"love\",\n",
|
||||
" \"r\": \"are\",\n",
|
||||
" \"u\": \"you\",\n",
|
||||
" \"ur\": \"your\",\n",
|
||||
" \"gonna\": \"going to\",\n",
|
||||
" \"wanna\": \"want to\",\n",
|
||||
" \"gotta\": \"got to\",\n",
|
||||
" \"pokemons\": \"pokemon\",\n",
|
||||
" \"pokmons\": \"pokemon\",\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"def standardize(text, lookup=SLANG_LOOKUP):\n",
|
||||
" \"\"\"Remplace les mots d'argot par leur forme standard.\"\"\"\n",
|
||||
" words = text.split()\n",
|
||||
" return \" \".join([lookup.get(word, word) for word in words])\n",
|
||||
"\n",
|
||||
"text = standardize(text)\n",
|
||||
"\n",
|
||||
"print(\"📖 Après Standardisation :\")\n",
|
||||
"print(text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"## ✂️ Étape 3 — Tokenization\n",
|
||||
"\n",
|
||||
"On découpe le texte en tokens individuels.\n",
|
||||
"\n",
|
||||
"> 📖 *Cours page 31 — `word_tokenize` (NLTK)*"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from nltk import word_tokenize\n",
|
||||
"\n",
|
||||
"tokens = word_tokenize(text)\n",
|
||||
"\n",
|
||||
"print(f\"✂️ {len(tokens)} tokens :\")\n",
|
||||
"print(tokens)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"## 🚫 Étape 4 — Suppression des Stopwords\n",
|
||||
"\n",
|
||||
"On retire les mots grammaticaux qui n'apportent pas de sens (\"the\", \"is\", \"a\"...).\n",
|
||||
"\n",
|
||||
"> 📖 *Cours page 27 — `cleanTextGT` avec `stopwords` (NLTK)*"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from nltk.corpus import stopwords\n",
|
||||
"\n",
|
||||
"stop_words = set(stopwords.words('english'))\n",
|
||||
"\n",
|
||||
"tokens = [token for token in tokens if token not in stop_words]\n",
|
||||
"\n",
|
||||
"print(\"🚫 Tokens après suppression des stopwords :\")\n",
|
||||
"print(tokens)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"## 🌿 Étape 5 — Lemmatization\n",
|
||||
"\n",
|
||||
"On réduit chaque mot à sa forme racine (`flames → flame`, `shooting → shoot`). On utilise le POS tag pour plus de précision.\n",
|
||||
"\n",
|
||||
"> 📖 *Cours page 36-37 — `WordNetLemmatizer` + POS tag*"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from nltk.stem.wordnet import WordNetLemmatizer\n",
|
||||
"from nltk import pos_tag\n",
|
||||
"from nltk.corpus import wordnet\n",
|
||||
"\n",
|
||||
"lem = WordNetLemmatizer()\n",
|
||||
"\n",
|
||||
"def get_wordnet_pos(treebank_tag):\n",
|
||||
" \"\"\"Convertit les tags Penn Treebank en tags WordNet.\"\"\"\n",
|
||||
" if treebank_tag.startswith('J'): return wordnet.ADJ\n",
|
||||
" elif treebank_tag.startswith('V'): return wordnet.VERB\n",
|
||||
" elif treebank_tag.startswith('N'): return wordnet.NOUN\n",
|
||||
" elif treebank_tag.startswith('R'): return wordnet.ADV\n",
|
||||
" else: return wordnet.NOUN\n",
|
||||
"\n",
|
||||
"pos_tags = pos_tag(tokens)\n",
|
||||
"tokens = [lem.lemmatize(token, get_wordnet_pos(tag)) for token, tag in pos_tags]\n",
|
||||
"\n",
|
||||
"print(\"🌿 Tokens après Lemmatization :\")\n",
|
||||
"print(tokens)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"## ✅ Résultat final — Texte nettoyé"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"clean_text = \" \".join(tokens)\n",
|
||||
"\n",
|
||||
"print(\"📄 Texte brut :\")\n",
|
||||
"print(raw_text.strip())\n",
|
||||
"print()\n",
|
||||
"print(\"✅ Texte nettoyé :\")\n",
|
||||
"print(clean_text)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python",
|
||||
"version": "3.12.3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
@@ -0,0 +1,158 @@
|
||||
"""Reusable text-cleaning pipeline for Pokemon descriptions.
|
||||
|
||||
This module mirrors the notebook cleaning steps and exposes a Streamlit-friendly API:
|
||||
- no input() calls
|
||||
- no print side effects
|
||||
- deterministic output for a given input
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import string
|
||||
from typing import Any, Dict, List
|
||||
|
||||
# Lookup table mapping slang / abbreviated word forms to their standard
# English equivalents; applied word-by-word by standardize().
SLANG_LOOKUP: Dict[str, str] = {
    "n": "and",
    "luv": "love",
    "r": "are",
    "u": "you",
    "ur": "your",
    "gonna": "going to",
    "wanna": "want to",
    "gotta": "got to",
    "pokemons": "pokemon",
    "pokmons": "pokemon",
    "bcz": "because",
}

# NLTK data packages required by the pipeline (tokenizer models, stopword
# lists, WordNet, POS taggers); downloaded on demand by ensure_nltk_resources().
_NLTK_RESOURCES = [
    "punkt",
    "punkt_tab",
    "stopwords",
    "wordnet",
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
]
|
||||
|
||||
|
||||
def _import_nltk() -> Any:
|
||||
"""Import NLTK lazily so this module can be imported before deps are installed."""
|
||||
try:
|
||||
import nltk # type: ignore
|
||||
except ModuleNotFoundError as exc:
|
||||
raise RuntimeError(
|
||||
"NLTK is not installed. Install project dependencies with: pip install -r requirements.txt"
|
||||
) from exc
|
||||
return nltk
|
||||
|
||||
|
||||
def ensure_nltk_resources(quiet: bool = True) -> None:
    """Download required NLTK resources if they are missing.

    Safe to call at app startup (including inside Streamlit);
    ``nltk.download`` is a no-op for resources already present.

    Args:
        quiet: passed through to ``nltk.download`` to suppress console output.

    Raises:
        RuntimeError: if any resource cannot be downloaded.
    """
    nltk = _import_nltk()
    for resource in _NLTK_RESOURCES:
        try:
            # nltk.download signals failure via its return value (False)
            # rather than raising when quiet=True, so the old
            # except-only guard let failed downloads pass silently.
            downloaded = nltk.download(resource, quiet=quiet)
        except Exception as exc:
            raise RuntimeError(f"Failed to download NLTK resource: {resource}") from exc
        if not downloaded:
            raise RuntimeError(f"Failed to download NLTK resource: {resource}")
|
||||
|
||||
|
||||
def remove_punctuation(text: str) -> str:
    """Strip every ASCII punctuation character from *text*."""
    return text.translate(str.maketrans("", "", string.punctuation))
|
||||
|
||||
|
||||
def remove_special_chars(text: str) -> str:
    """Drop non-ASCII characters, replace remaining non-letters with
    spaces, and collapse whitespace runs into single spaces."""
    ascii_only = text.encode("ascii", "ignore").decode("ascii")
    letters_only = re.sub(r"[^a-zA-Z\s]", " ", ascii_only)
    return re.sub(r"\s+", " ", letters_only).strip()
|
||||
|
||||
|
||||
def remove_short_words(text: str, min_len: int = 3) -> str:
    """Keep only the words of *text* whose length is at least *min_len*."""
    kept = [word for word in text.split() if len(word) >= min_len]
    return " ".join(kept)
|
||||
|
||||
|
||||
def remove_alphanum_words(text: str) -> str:
    """Drop words that mix letters and digits (e.g. 'a1b'), keeping
    pure-alphabetic and pure-numeric words unchanged."""

    def _is_mixed(word: str) -> bool:
        # A word is "mixed" when it contains at least one letter AND
        # at least one digit anywhere in it.
        return bool(re.search(r"[a-zA-Z]", word)) and bool(re.search(r"[0-9]", word))

    return " ".join(word for word in text.split() if not _is_mixed(word))
|
||||
|
||||
|
||||
def standardize(text: str, lookup: Dict[str, str] | None = None) -> str:
    """Replace slang words in *text* with their standard form, word by word.

    Args:
        text: whitespace-separated text to normalize.
        lookup: mapping of slang -> standard form; defaults to
            ``SLANG_LOOKUP``. An explicitly passed empty dict is honored
            (no substitutions performed).

    Returns:
        The text with each known slang word replaced.
    """
    # Use `is None` rather than truthiness: the previous `lookup or
    # SLANG_LOOKUP` silently fell back to the module default when a
    # caller passed an empty dict to disable substitution.
    mapping = SLANG_LOOKUP if lookup is None else lookup
    return " ".join(mapping.get(word, word) for word in text.split())
|
||||
|
||||
|
||||
def _get_wordnet_pos(treebank_tag: str) -> str:
    """Map a Penn Treebank POS tag to the matching WordNet POS constant.

    Only the tag's first letter matters (J/V/N/R); anything else —
    including an empty tag — defaults to NOUN, the lemmatizer's safest
    fallback.
    """
    wordnet = _import_nltk().corpus.wordnet
    by_prefix = {
        "J": wordnet.ADJ,
        "V": wordnet.VERB,
        "N": wordnet.NOUN,
        "R": wordnet.ADV,
    }
    return by_prefix.get(treebank_tag[:1], wordnet.NOUN)
|
||||
|
||||
|
||||
def clean_pokemon_text(raw_text: str, min_len: int = 3) -> Dict[str, Any]:
    """Run the full cleaning pipeline and return every intermediate stage.

    Pipeline: lowercase -> punctuation removal -> mixed alphanumeric-word
    removal -> special-character removal -> short-word removal -> slang
    standardization -> tokenization -> stopword removal -> POS-aware
    lemmatization.

    Args:
        raw_text: the user-provided description to clean.
        min_len: minimum word length kept during noise removal.

    Returns:
        Dict with keys ``raw_text``, ``noise_removed``, ``standardized``,
        ``tokens``, ``tokens_no_stopwords``, ``lemmas`` and ``clean_text``,
        so a UI can display each stage if desired.

    Raises:
        TypeError: if raw_text is not a string.
        RuntimeError: if NLTK or its data resources are unavailable.
    """
    if not isinstance(raw_text, str):
        raise TypeError("raw_text must be a string")

    nltk = _import_nltk()
    ensure_nltk_resources(quiet=True)

    # Noise removal, in the same order as the notebook.
    lowered = raw_text.lower()
    without_punct = remove_punctuation(lowered)
    without_alnum = remove_alphanum_words(without_punct)
    ascii_text = remove_special_chars(without_alnum)
    noise_removed = remove_short_words(ascii_text, min_len=min_len)

    # Slang -> standard-form substitution.
    standardized = standardize(noise_removed)

    # Tokenization.
    tokens = nltk.word_tokenize(standardized)

    # Stopword removal.
    english_stopwords = set(nltk.corpus.stopwords.words("english"))
    tokens_no_stopwords = [token for token in tokens if token not in english_stopwords]

    # POS-aware lemmatization.
    lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
    lemmas = [
        lemmatizer.lemmatize(token, _get_wordnet_pos(tag))
        for token, tag in nltk.pos_tag(tokens_no_stopwords)
    ]

    return {
        "raw_text": raw_text,
        "noise_removed": noise_removed,
        "standardized": standardized,
        "tokens": tokens,
        "tokens_no_stopwords": tokens_no_stopwords,
        "lemmas": lemmas,
        "clean_text": " ".join(lemmas),
    }
|
||||
|
||||
|
||||
def get_clean_text(raw_text: str, min_len: int = 3) -> str:
    """Convenience wrapper returning only the final cleaned text."""
    stages = clean_pokemon_text(raw_text, min_len=min_len)
    return stages["clean_text"]
|
||||
Reference in New Issue
Block a user