first commit
This commit is contained in:
@@ -0,0 +1,298 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# 🎴 Génération de Carte Pokémon depuis un Texte Descriptif\n",
|
||||
"## Partie 1 — Nettoyage du Texte (NLU Pipeline)\n",
|
||||
"\n",
|
||||
"On prend un texte descriptif fourni par l'utilisateur et on le nettoie étape par étape.\n",
|
||||
"\n",
|
||||
"```\n",
|
||||
"Texte brut → Noise Removal → Tokenization → Stopwords → Lemmatization → Texte propre\n",
|
||||
"```"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"## 📦 Installation des dépendances"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "",
|
||||
"evalue": "",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[1;31mRunning cells with 'Python 3.12.3' requires the ipykernel package.\n",
|
||||
"\u001b[1;31m<a href='command:jupyter.createPythonEnvAndSelectController'>Create a Python Environment</a> with the required packages.\n",
|
||||
"\u001b[1;31mOr install 'ipykernel' using the command: '/usr/bin/python3 -m pip install ipykernel -U --user --force-reinstall'"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"!pip install nltk --quiet\n",
|
||||
"\n",
|
||||
"import nltk\n",
|
||||
"nltk.download('punkt', quiet=True)\n",
|
||||
"nltk.download('punkt_tab', quiet=True)\n",
|
||||
"nltk.download('stopwords', quiet=True)\n",
|
||||
"nltk.download('wordnet', quiet=True)\n",
|
||||
"nltk.download('averaged_perceptron_tagger', quiet=True)\n",
|
||||
"nltk.download('averaged_perceptron_tagger_eng', quiet=True)\n",
|
||||
"\n",
|
||||
"print(\"✅ Dépendances installées !\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"## 📝 Saisie du texte utilisateur"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"raw_text = \"\"\"\n",
|
||||
"This is a HUGE fire dragon!!! It has got massive red wings and shoots \n",
|
||||
"powerfull flames from its mouth... It's super fast n really strong!!\n",
|
||||
"Its body is coverd with shiny golden scales & it lives in volcanos.\n",
|
||||
"it luv to fight other pokémons and is very very aggressive >:(\n",
|
||||
"\"\"\"\n",
|
||||
"\n",
|
||||
"print(\"📄 Texte brut :\")\n",
|
||||
"print(raw_text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"## 🧹 Étape 1 — Noise Removal\n",
|
||||
"\n",
|
||||
"On supprime la ponctuation, les caractères spéciaux, les mots trop courts, et on met tout en minuscules.\n",
|
||||
"\n",
|
||||
"> 📖 *Cours page 25-29 — `removePunctuation`, `removeShortWords`, `removePattern`*"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import re\n",
|
||||
"import string\n",
|
||||
"\n",
|
||||
"def remove_punctuation(text):\n",
|
||||
" \"\"\"Supprime la ponctuation du texte.\"\"\"\n",
|
||||
" mapping_table = text.maketrans('', '', string.punctuation)\n",
|
||||
" return text.translate(mapping_table)\n",
|
||||
"\n",
|
||||
"def remove_special_chars(text):\n",
|
||||
" \"\"\"Supprime les caractères non-ASCII (emojis, accents parasites...).\"\"\"\n",
|
||||
" text = text.encode('ascii', 'ignore').decode('ascii')\n",
|
||||
" text = re.sub(r'[^a-zA-Z\\s]', ' ', text)\n",
|
||||
" return re.sub(r'\\s+', ' ', text).strip()\n",
|
||||
"\n",
|
||||
"def remove_short_words(text, min_len=3):\n",
|
||||
" \"\"\"Supprime les mots de moins de min_len caractères.\"\"\"\n",
|
||||
" return \" \".join([word for word in text.split() if len(word) >= min_len])\n",
|
||||
"\n",
|
||||
"# Application\n",
|
||||
"text = raw_text.lower() # minuscules\n",
|
||||
"text = remove_punctuation(text) # ponctuation\n",
|
||||
"text = remove_special_chars(text) # caractères spéciaux\n",
|
||||
"text = remove_short_words(text) # mots trop courts\n",
|
||||
"\n",
|
||||
"print(\"🔇 Après Noise Removal :\")\n",
|
||||
"print(text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"## 📖 Étape 2 — Object Standardization\n",
|
||||
"\n",
|
||||
"On remplace les abréviations et l'argot par leurs formes standard.\n",
|
||||
"\n",
|
||||
"> 📖 *Cours page 38 — lookup table `standardize`*"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"SLANG_LOOKUP = {\n",
|
||||
" \"n\": \"and\",\n",
|
||||
" \"luv\": \"love\",\n",
|
||||
" \"r\": \"are\",\n",
|
||||
" \"u\": \"you\",\n",
|
||||
" \"ur\": \"your\",\n",
|
||||
" \"gonna\": \"going to\",\n",
|
||||
" \"wanna\": \"want to\",\n",
|
||||
" \"gotta\": \"got to\",\n",
|
||||
" \"pokemons\": \"pokemon\",\n",
|
||||
" \"pokmons\": \"pokemon\",\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"def standardize(text, lookup=SLANG_LOOKUP):\n",
|
||||
" \"\"\"Remplace les mots d'argot par leur forme standard.\"\"\"\n",
|
||||
" words = text.split()\n",
|
||||
" return \" \".join([lookup.get(word, word) for word in words])\n",
|
||||
"\n",
|
||||
"text = standardize(text)\n",
|
||||
"\n",
|
||||
"print(\"📖 Après Standardisation :\")\n",
|
||||
"print(text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"## ✂️ Étape 3 — Tokenization\n",
|
||||
"\n",
|
||||
"On découpe le texte en tokens individuels.\n",
|
||||
"\n",
|
||||
"> 📖 *Cours page 31 — `word_tokenize` (NLTK)*"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from nltk import word_tokenize\n",
|
||||
"\n",
|
||||
"tokens = word_tokenize(text)\n",
|
||||
"\n",
|
||||
"print(f\"✂️ {len(tokens)} tokens :\")\n",
|
||||
"print(tokens)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"## 🚫 Étape 4 — Suppression des Stopwords\n",
|
||||
"\n",
|
||||
"On retire les mots grammaticaux qui n'apportent pas de sens (\"the\", \"is\", \"a\"...).\n",
|
||||
"\n",
|
||||
"> 📖 *Cours page 27 — `cleanTextGT` avec `stopwords` (NLTK)*"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from nltk.corpus import stopwords\n",
|
||||
"\n",
|
||||
"stop_words = set(stopwords.words('english'))\n",
|
||||
"\n",
|
||||
"tokens = [token for token in tokens if token not in stop_words]\n",
|
||||
"\n",
|
||||
"print(\"🚫 Tokens après suppression des stopwords :\")\n",
|
||||
"print(tokens)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"## 🌿 Étape 5 — Lemmatization\n",
|
||||
"\n",
|
||||
"On réduit chaque mot à sa forme racine (`flames → flame`, `shooting → shoot`). On utilise le POS tag pour plus de précision.\n",
|
||||
"\n",
|
||||
"> 📖 *Cours page 36-37 — `WordNetLemmatizer` + POS tag*"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from nltk.stem.wordnet import WordNetLemmatizer\n",
|
||||
"from nltk import pos_tag\n",
|
||||
"from nltk.corpus import wordnet\n",
|
||||
"\n",
|
||||
"lem = WordNetLemmatizer()\n",
|
||||
"\n",
|
||||
"def get_wordnet_pos(treebank_tag):\n",
|
||||
" \"\"\"Convertit les tags Penn Treebank en tags WordNet.\"\"\"\n",
|
||||
" if treebank_tag.startswith('J'): return wordnet.ADJ\n",
|
||||
" elif treebank_tag.startswith('V'): return wordnet.VERB\n",
|
||||
" elif treebank_tag.startswith('N'): return wordnet.NOUN\n",
|
||||
" elif treebank_tag.startswith('R'): return wordnet.ADV\n",
|
||||
" else: return wordnet.NOUN\n",
|
||||
"\n",
|
||||
"pos_tags = pos_tag(tokens)\n",
|
||||
"tokens = [lem.lemmatize(token, get_wordnet_pos(tag)) for token, tag in pos_tags]\n",
|
||||
"\n",
|
||||
"print(\"🌿 Tokens après Lemmatization :\")\n",
|
||||
"print(tokens)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"## ✅ Résultat final — Texte nettoyé"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"clean_text = \" \".join(tokens)\n",
|
||||
"\n",
|
||||
"print(\"📄 Texte brut :\")\n",
|
||||
"print(raw_text.strip())\n",
|
||||
"print()\n",
|
||||
"print(\"✅ Texte nettoyé :\")\n",
|
||||
"print(clean_text)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python",
|
||||
"version": "3.12.3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
@@ -0,0 +1,158 @@
|
||||
"""Reusable text-cleaning pipeline for Pokemon descriptions.
|
||||
|
||||
This module mirrors the notebook cleaning steps and exposes a Streamlit-friendly API:
|
||||
- no input() calls
|
||||
- no print side effects
|
||||
- deterministic output for a given input
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import string
|
||||
from typing import Any, Dict, List
|
||||
|
||||
# Lookup table mapping slang / abbreviated word forms to their standard
# English equivalents; applied word-by-word by standardize().
SLANG_LOOKUP: Dict[str, str] = {
    "n": "and",
    "luv": "love",
    "r": "are",
    "u": "you",
    "ur": "your",
    "gonna": "going to",
    "wanna": "want to",
    "gotta": "got to",
    "pokemons": "pokemon",
    "pokmons": "pokemon",
    "bcz": "because",
}

# NLTK data packages required by the pipeline (tokenizer models, stopword
# lists, WordNet, POS taggers); downloaded on demand by ensure_nltk_resources().
_NLTK_RESOURCES = [
    "punkt",
    "punkt_tab",
    "stopwords",
    "wordnet",
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
]
|
||||
|
||||
|
||||
def _import_nltk() -> Any:
|
||||
"""Import NLTK lazily so this module can be imported before deps are installed."""
|
||||
try:
|
||||
import nltk # type: ignore
|
||||
except ModuleNotFoundError as exc:
|
||||
raise RuntimeError(
|
||||
"NLTK is not installed. Install project dependencies with: pip install -r requirements.txt"
|
||||
) from exc
|
||||
return nltk
|
||||
|
||||
|
||||
def ensure_nltk_resources(quiet: bool = True) -> None:
    """Download required NLTK resources if they are missing.

    Safe to call at app startup (including inside Streamlit);
    ``nltk.download`` is a no-op for resources already present.

    Args:
        quiet: passed through to ``nltk.download`` to suppress console output.

    Raises:
        RuntimeError: if any resource cannot be downloaded.
    """
    nltk = _import_nltk()
    for resource in _NLTK_RESOURCES:
        try:
            # nltk.download signals failure via its return value (False)
            # rather than raising when quiet=True, so the old
            # except-only guard let failed downloads pass silently.
            downloaded = nltk.download(resource, quiet=quiet)
        except Exception as exc:
            raise RuntimeError(f"Failed to download NLTK resource: {resource}") from exc
        if not downloaded:
            raise RuntimeError(f"Failed to download NLTK resource: {resource}")
|
||||
|
||||
|
||||
def remove_punctuation(text: str) -> str:
    """Strip every ASCII punctuation character from *text*."""
    return text.translate(str.maketrans("", "", string.punctuation))
|
||||
|
||||
|
||||
def remove_special_chars(text: str) -> str:
    """Drop non-ASCII characters, replace remaining non-letters with
    spaces, and collapse whitespace runs into single spaces."""
    ascii_only = text.encode("ascii", "ignore").decode("ascii")
    letters_only = re.sub(r"[^a-zA-Z\s]", " ", ascii_only)
    return re.sub(r"\s+", " ", letters_only).strip()
|
||||
|
||||
|
||||
def remove_short_words(text: str, min_len: int = 3) -> str:
    """Keep only the words of *text* whose length is at least *min_len*."""
    kept = [word for word in text.split() if len(word) >= min_len]
    return " ".join(kept)
|
||||
|
||||
|
||||
def remove_alphanum_words(text: str) -> str:
    """Drop words that mix letters and digits (e.g. 'a1b'), keeping
    pure-alphabetic and pure-numeric words unchanged."""

    def _is_mixed(word: str) -> bool:
        # A word is "mixed" when it contains at least one letter AND
        # at least one digit anywhere in it.
        return bool(re.search(r"[a-zA-Z]", word)) and bool(re.search(r"[0-9]", word))

    return " ".join(word for word in text.split() if not _is_mixed(word))
|
||||
|
||||
|
||||
def standardize(text: str, lookup: Dict[str, str] | None = None) -> str:
    """Replace slang words in *text* with their standard form, word by word.

    Args:
        text: whitespace-separated text to normalize.
        lookup: mapping of slang -> standard form; defaults to
            ``SLANG_LOOKUP``. An explicitly passed empty dict is honored
            (no substitutions performed).

    Returns:
        The text with each known slang word replaced.
    """
    # Use `is None` rather than truthiness: the previous `lookup or
    # SLANG_LOOKUP` silently fell back to the module default when a
    # caller passed an empty dict to disable substitution.
    mapping = SLANG_LOOKUP if lookup is None else lookup
    return " ".join(mapping.get(word, word) for word in text.split())
|
||||
|
||||
|
||||
def _get_wordnet_pos(treebank_tag: str) -> str:
    """Map a Penn Treebank POS tag to the matching WordNet POS constant.

    Only the tag's first letter matters (J/V/N/R); anything else —
    including an empty tag — defaults to NOUN, the lemmatizer's safest
    fallback.
    """
    wordnet = _import_nltk().corpus.wordnet
    by_prefix = {
        "J": wordnet.ADJ,
        "V": wordnet.VERB,
        "N": wordnet.NOUN,
        "R": wordnet.ADV,
    }
    return by_prefix.get(treebank_tag[:1], wordnet.NOUN)
|
||||
|
||||
|
||||
def clean_pokemon_text(raw_text: str, min_len: int = 3) -> Dict[str, Any]:
    """Run the full cleaning pipeline and return every intermediate stage.

    Pipeline: lowercase -> punctuation removal -> mixed alphanumeric-word
    removal -> special-character removal -> short-word removal -> slang
    standardization -> tokenization -> stopword removal -> POS-aware
    lemmatization.

    Args:
        raw_text: the user-provided description to clean.
        min_len: minimum word length kept during noise removal.

    Returns:
        Dict with keys ``raw_text``, ``noise_removed``, ``standardized``,
        ``tokens``, ``tokens_no_stopwords``, ``lemmas`` and ``clean_text``,
        so a UI can display each stage if desired.

    Raises:
        TypeError: if raw_text is not a string.
        RuntimeError: if NLTK or its data resources are unavailable.
    """
    if not isinstance(raw_text, str):
        raise TypeError("raw_text must be a string")

    nltk = _import_nltk()
    ensure_nltk_resources(quiet=True)

    # Noise removal, in the same order as the notebook.
    lowered = raw_text.lower()
    without_punct = remove_punctuation(lowered)
    without_alnum = remove_alphanum_words(without_punct)
    ascii_text = remove_special_chars(without_alnum)
    noise_removed = remove_short_words(ascii_text, min_len=min_len)

    # Slang -> standard-form substitution.
    standardized = standardize(noise_removed)

    # Tokenization.
    tokens = nltk.word_tokenize(standardized)

    # Stopword removal.
    english_stopwords = set(nltk.corpus.stopwords.words("english"))
    tokens_no_stopwords = [token for token in tokens if token not in english_stopwords]

    # POS-aware lemmatization.
    lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
    lemmas = [
        lemmatizer.lemmatize(token, _get_wordnet_pos(tag))
        for token, tag in nltk.pos_tag(tokens_no_stopwords)
    ]

    return {
        "raw_text": raw_text,
        "noise_removed": noise_removed,
        "standardized": standardized,
        "tokens": tokens,
        "tokens_no_stopwords": tokens_no_stopwords,
        "lemmas": lemmas,
        "clean_text": " ".join(lemmas),
    }
|
||||
|
||||
|
||||
def get_clean_text(raw_text: str, min_len: int = 3) -> str:
    """Convenience wrapper returning only the final cleaned text."""
    stages = clean_pokemon_text(raw_text, min_len=min_len)
    return stages["clean_text"]
|
||||
Reference in New Issue
Block a user