first commit

2026-03-19 18:16:20 +01:00
commit 584b2e07b4
34 changed files with 4381 additions and 0 deletions

67
CLAUDE.md Normal file

@@ -0,0 +1,67 @@
# CLAUDE.md
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
## Project Overview
Juicepyter is a Pokémon card generator pipeline that takes a natural language description, cleans it, extracts structured JSON metadata, and generates a card image using a LoRA-finetuned Stable Diffusion model. A Streamlit UI (`app.py`) ties it all together.
## Architecture — Three-Stage Pipeline
The pipeline (`prompt_to_card_pipeline.py`) orchestrates three stages (sketched in code after this list):
1. **Text cleaning** (`text-cleaner/text_cleaning_pipeline.py`): NLTK-based pipeline — lowercasing, punctuation/slang removal, stopword filtering, POS-aware lemmatization. Entry point: `get_clean_text(raw_text) -> str`.
2. **Keyword extraction + JSON inference** (`clean-text-to-keywords/`): spaCy + YAKE keyword extraction (`keyword_extractor.py`) → rule-based JSON inference (`json_inference.py`) that populates a TCG-style card template. CLI: `infer_json_usage.py`. No LLM calls — deterministic and rule-based.
3. **Card image generation** (`card_generator_adapter.py`): Loads `runwayml/stable-diffusion-v1-5` with a LoRA adapter (PEFT) from `pokemon_card_lora/`, converts metadata to a SD prompt via `metadata_to_conditioning()`, runs inference. The generator module is pluggable via `--generator-module`.
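For orientation, the same flow expressed as direct function calls (a minimal sketch, assuming the stage directories have been added to `sys.path`; the real CLI loads these modules by file path):
```python
# Illustrative sketch only -- the hyphenated stage folders are not importable as
# packages, so assume they were appended to sys.path beforehand.
from text_cleaning_pipeline import get_clean_text
from keyword_extractor import KeywordExtractor
from json_inference import fill_template_from_keywords
from card_generator_adapter import build_pipeline, metadata_to_conditioning

cleaned = get_clean_text("My pokemon is a huge fire dragon with red wings")   # stage 1: cleaning
keywords = KeywordExtractor.from_default_model().extract(cleaned)             # stage 2: keywords
card_meta = fill_template_from_keywords({}, keywords)                         # stage 2: JSON inference
pipe = build_pipeline("pokemon_card_lora", device="cpu")                      # stage 3: SD + LoRA
pipe(metadata_to_conditioning(card_meta)).images[0].save("generated_card.png")
```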
`fetch_card.py` is a standalone data collection script that downloads real Pokémon TCG card images with embedded metadata using the TCGdex SDK.
## Commands
### Run the Streamlit app
```bash
streamlit run app.py
```
### Run the full pipeline CLI
```bash
python prompt_to_card_pipeline.py "description text" \
--text-cleaner-path text-cleaner/text_cleaning_pipeline.py \
--infer-script-path clean-text-to-keywords/infer_json_usage.py \
--checkpoint pokemon_card_lora \
--template clean-text-to-keywords/json_template_example.json \
--generator-module card_generator_adapter.py \
--device cpu \
--save-path generated_card.png \
--print-json
```
### Run keyword extraction + JSON inference only
```bash
cd clean-text-to-keywords
python infer_json_usage.py --template json_template_example.json "your pokemon description"
```
### Tests
```bash
cd clean-text-to-keywords
python -m unittest -q
```
## Dependencies
- **text-cleaner**: `nltk` (punkt, stopwords, wordnet, averaged_perceptron_tagger)
- **clean-text-to-keywords**: `spacy>=3.7.0`, `yake>=0.4.2`, spaCy model `en_core_web_sm`
- **card generation**: `diffusers`, `torch`, `peft`, `transformers`, `accelerate`, `safetensors`
- **app**: `streamlit`, `Pillow`
- **fetch_card**: `tcgdexsdk`, `Pillow`
Python 3.13 or lower recommended (spaCy compatibility).
## Key Design Decisions
- The generator module pattern is pluggable: any module with `build_pipeline(checkpoint_path, device)` and optionally `metadata_to_conditioning(meta)` can be swapped in via `--generator-module` (see the sketch after this list).
- The JSON inference stage preserves non-empty fields in the provided template — only empty fields get populated.
- The LoRA base model is `runwayml/stable-diffusion-v1-5` with PEFT adapter weights in `pokemon_card_lora/`.
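As a sketch of that contract, a hypothetical replacement module (call it `my_generator.py`, not a file in this repo) only needs to expose these two functions:
```python
# my_generator.py -- hypothetical drop-in for --generator-module (illustrative sketch).
def build_pipeline(checkpoint_path: str, device: str):
    """Return any callable whose result exposes an `.images` list."""
    import torch
    from diffusers import StableDiffusionPipeline

    pipe = StableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",
        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    )
    pipe.load_lora_weights(checkpoint_path)  # expects adapter_config.json + adapter_model.safetensors
    return pipe.to(device)


def metadata_to_conditioning(meta: dict) -> str:
    # Optional hook: turn the inferred card JSON into a Stable Diffusion prompt.
    return f"Pokemon trading card of {meta.get('name', 'Unknown')}, {meta.get('type', 'normal')}-type"
```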

130
app.py Normal file

@@ -0,0 +1,130 @@
import streamlit as st
import subprocess
import sys
import shlex
from pathlib import Path
from PIL import Image
APP_DIR = Path(__file__).resolve().parent
PIPELINE_SCRIPT = APP_DIR / "prompt_to_card_pipeline.py"
TEXT_CLEANER_PATH = APP_DIR / "text-cleaner" / "text_cleaning_pipeline.py"
INFER_SCRIPT_PATH = APP_DIR / "clean-text-to-keywords" / "infer_json_usage.py"
CHECKPOINT_PATH = APP_DIR / "pokemon_card_lora" / "training_history.pt"
TEMPLATE_PATH = APP_DIR / "clean-text-to-keywords" / "json_template_example.json"
IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg", ".webp", ".bmp")
def _extract_image_from_stdout(stdout: str) -> Path | None:
for line in reversed(stdout.splitlines()):
text = line.strip().strip("\"'")
if not text:
continue
candidate = Path(text)
if not candidate.is_absolute():
candidate = APP_DIR / candidate
if candidate.suffix.lower() in IMAGE_EXTENSIONS and candidate.exists():
return candidate
return None
def run_prompt_pipeline(prompt_text: str) -> tuple[Path | None, str, list[str]]:
# Each flag and its value must be separate argv elements; a single space-joined
# string is not parsed by subprocess.run unless shell=True is used.
cmd = [
sys.executable,
str(PIPELINE_SCRIPT),
prompt_text,
"--text-cleaner-path", str(TEXT_CLEANER_PATH),
"--infer-script-path", str(INFER_SCRIPT_PATH),
"--checkpoint", str(APP_DIR / "pokemon_card_lora"),
"--template", str(TEMPLATE_PATH),
"--generator-module", str(APP_DIR / "card_generator_adapter.py"),
"--device", "cuda",
"--save-path", "generated_card.png",
"--print-json",
]
result = subprocess.run(
cmd,
cwd=APP_DIR,
capture_output=True,
text=True,
check=False,
)
full_output = (result.stdout or "") + ("\n" + result.stderr if result.stderr else "")
if result.returncode != 0:
return None, full_output.strip() or "Erreur inconnue pendant le pipeline.", cmd
image_path = _extract_image_from_stdout(result.stdout or "")
return image_path, full_output.strip(), cmd
# ------------------------------------------------------------------ #
# Configuration #
# ------------------------------------------------------------------ #
st.set_page_config(
page_title="Générateur de Carte Pokémon",
page_icon=Image.open(Path(__file__).with_name("pokeball.png")),
layout="centered",
)
logo_col, title_col = st.columns([1, 6], vertical_alignment="center")
with logo_col:
st.image(Image.open(Path(__file__).with_name("pokeball.png")), width=72)
with title_col:
st.title("Générateur de Carte Pokémon")
st.markdown("Décrivez votre Pokémon en langage naturel et laissez la magie opérer !")
# ------------------------------------------------------------------ #
# Saisie utilisateur #
# ------------------------------------------------------------------ #
raw_text = st.text_area(
label="Description de votre Pokémon",
placeholder=(
"Ex: My pokemon is called Pyrokar! Its a huge fire dragon with massive "
"red wings and shoots flames from its mouth... super fast n aggressive >:("
),
height=180,
)
st.markdown(
"""
<style>
div.stButton > button {
background-color: #d62828;
color: white;
border: 1px solid #b91c1c;
}
div.stButton > button:hover {
background-color: #b91c1c;
border-color: #991b1b;
color: white;
}
</style>
""",
unsafe_allow_html=True,
)
generate = st.button(" Générer la carte", use_container_width=True)
# ------------------------------------------------------------------ #
# Pipeline #
# ------------------------------------------------------------------ #
if generate:
if not raw_text.strip():
st.warning("Veuillez entrer une description avant de générer.")
else:
with st.spinner("Génération de la carte Pokémon..."):
image, logs, _cmd = run_prompt_pipeline(raw_text)
if image is not None:
st.image(image, caption="Carte Pokémon générée", width="stretch")
if logs:
with st.expander("Logs pipeline"):
st.code(logs)
else:
st.error("Aucune image générée détectée. Vérifiez les chemins du pipeline.")
if logs:
with st.expander("Logs pipeline"):
st.code(logs)

107
card_generator_adapter.py Normal file

@@ -0,0 +1,107 @@
"""Adapter to load the LoRA checkpoint and define conditioning logic.
Customize this file to match your model architecture, then use:
--generator-module card_generator_adapter.py
"""
from __future__ import annotations
from typing import Any, Mapping
def build_pipeline(checkpoint_path: str, device: str):
"""Load LoRA adapter and return a callable pipeline.
The pipeline must accept:
pipeline(prompt_or_conditioning, num_inference_steps=30, guidance_scale=7.5)
and return an object with .images attribute.
"""
from pathlib import Path
checkpoint_input = Path(checkpoint_path).expanduser().resolve()
if checkpoint_input.is_dir():
checkpoint_dir = checkpoint_input
elif checkpoint_input.exists():
checkpoint_dir = checkpoint_input.parent
else:
raise FileNotFoundError(f"Checkpoint path not found: {checkpoint_input}")
# Load base Stable Diffusion model + LoRA adapter (PEFT)
try:
from diffusers import StableDiffusionPipeline
import torch
except ImportError as e:
raise RuntimeError(
f"diffusers and torch required. Install: pip install diffusers torch "
f"(error: {e})"
)
# Load base model
model_id = "runwayml/stable-diffusion-v1-5"
pipe = StableDiffusionPipeline.from_pretrained(
model_id,
torch_dtype=torch.float16 if device == "cuda" else torch.float32,
)
pipe = pipe.to(device)
# Load LoRA weights from adapter_model.safetensors
adapter_path = checkpoint_dir / "adapter_model.safetensors"
if adapter_path.exists():
try:
pipe.load_lora_weights(str(checkpoint_dir))
except Exception as e:
message = str(e)
if "PEFT backend is required" in message:
raise RuntimeError(
"Failed to load LoRA: PEFT backend is missing. "
"Install required packages with: pip install peft transformers accelerate safetensors"
) from e
raise RuntimeError(
f"Failed to load LoRA from {checkpoint_dir}: {e}\n"
"Ensure adapter_config.json and adapter_model.safetensors are present."
) from e
else:
raise FileNotFoundError(
f"LoRA adapter not found at {adapter_path}. "
f"Expected: adapter_model.safetensors in {checkpoint_dir}"
)
return pipe
def metadata_to_conditioning(meta: Mapping[str, Any]) -> str:
"""Convert metadata dict to a Stable Diffusion prompt.
LoRA is trained on Pokemon cards, so describe it as such.
"""
name = str(meta.get("name", "Unknown Pokemon"))
pokemon_type = str(meta.get("type", "normal")).capitalize()
secondary = meta.get("secondary_type")
hp = str(meta.get("hp", "60"))
attacks = meta.get("attacks") or []
attack_list = []
if isinstance(attacks, list):
for atk in attacks:
if isinstance(atk, dict):
attack_list.append(str(atk.get("name", "")).lower())
elif atk:
attack_list.append(str(atk).lower())
# Build a descriptive prompt for card generation
prompt = f"Pokemon trading card of {name}, {pokemon_type}-type Pokemon"
if secondary:
prompt += f"/{secondary.capitalize()}"
prompt += f", HP {hp}"
if attack_list:
prompt += f", with attacks: {', '.join(attack_list[:2])}"
description = meta.get("description", "").strip()
if description:
prompt += f". {description}"
prompt += ". High quality illustration, official Pokemon card style."
return prompt


@@ -0,0 +1,189 @@
# Pokemon Text-to-JSON Pipeline
This project converts free-form Pokemon description text into:
1. A normalized keyword list
2. A populated Pokemon JSON object (from a blank/key-only template)
The pipeline is deterministic and rule-based.
## Architecture
### Stage 1: Keyword Extraction
File: `keyword_extractor.py`
Input: raw text description
Core logic:
- spaCy tokenization and POS tagging
- POS filtering (`NOUN`, `ADJ`, `VERB`)
- stopword and punctuation removal
- lemma-based normalization
- domain synonym normalization (example: `flames -> fire`)
- optional YAKE relevance scoring
- conservative retention policy so detail is not over-pruned
Output: ordered list of normalized keywords
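For example, calling the extractor directly (a minimal sketch, assuming `en_core_web_sm` is installed; the CLI equivalents are listed under "How To Run" below):
```python
from keyword_extractor import extract_keywords

# Synonyms collapse to their canonical domain concept, e.g. "flames" -> "fire".
keywords = extract_keywords("burning creature with fiery flames")
print(keywords)  # typically something like ['fire', 'creature']
```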
### Stage 2: JSON Inference
File: `json_inference.py`
Input: keyword list + optional JSON template
Core logic:
- infer primary/secondary type
- infer name candidate
- infer attacks, abilities, habitat, personality
- infer basic stats (`hp`, `attack`, `defense`, `speed`)
- fill nested TCG-like template fields (`types`, `attacks`, `weaknesses`, `stage`, `retreat`, etc.)
- preserve already non-empty values in the provided template
Output: inferred JSON profile
### Stage 3: Orchestration CLI
File: `infer_json_usage.py`
This is the main entrypoint for end-to-end usage.
Default behavior:
1. prints extracted keyword list
2. prints inferred JSON
## Project Structure
- `keyword_extractor.py`: keyword extraction engine
- `json_inference.py`: keyword-to-JSON inference logic
- `infer_json_usage.py`: end-to-end CLI
- `example_usage.py`: keyword extraction only CLI
- `json_template_example.json`: sample blank/key-only template
- `test_keyword_extractor.py`: extraction tests
- `test_json_inference.py`: inference tests
- `requirements.txt`: Python dependencies
## Requirements
- Python 3.13 or lower is recommended for spaCy compatibility
- pip
Dependencies in `requirements.txt`:
- `spacy>=3.7.0`
- `yake>=0.4.2`
## Setup
1. Create and activate a virtual environment (recommended)
```bash
python -m venv .venv
source .venv/bin/activate
```
2. Install dependencies
```bash
pip install -r requirements.txt
```
3. Install spaCy English model
```bash
python -m spacy download en_core_web_sm
```
## How To Run
### A) Extract keywords only
```bash
python example_usage.py "furret long slender agile creature with soft fur"
```
Output: JSON list of keywords.
### B) End-to-end: text -> keywords -> JSON
```bash
python infer_json_usage.py --template json_template_example.json "furret long slender agile creature with soft fur"
```
Output order:
1. keyword list
2. inferred JSON
### C) End-to-end but JSON only
```bash
python infer_json_usage.py --json-only --template json_template_example.json "furret long slender agile creature with soft fur"
```
### D) Start from keywords directly
```bash
python infer_json_usage.py --template json_template_example.json --keywords furret normal tail smash tunnel agile cheerful explore endurance
```
Tip: If you pass `--keywords`, text extraction is skipped.
## Template Behavior
If `--template` is omitted, inference returns a full inferred profile object.
If `--template` is provided:
- empty fields are populated from inferred values
- non-empty fields are preserved
Current sample template supports nested card-like data including:
- `types`
- `attacks` with `cost`, `name`, `effect`, `damage`
- `weaknesses` with `type`, `value`
- `stage`, `retreat`, `legal`
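A small illustration of these fill rules (a sketch; the inferred values shown are indicative):
```python
from json_inference import fill_template_from_keywords

template = {"name": "CustomName", "type": "", "attacks": [], "description": ""}
result = fill_template_from_keywords(template, ["furret", "normal", "tail", "smash"])

print(result["name"])     # "CustomName" -- non-empty value is preserved
print(result["type"])     # "normal"     -- empty field is populated from the keywords
print(result["attacks"])  # e.g. ['tail', 'smash']
```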
## Tests
Run all tests:
```bash
python -m unittest -q
```
## Troubleshooting
### 1) spaCy model not found
Error mentions `en_core_web_sm` not installed.
Fix:
```bash
python -m spacy download en_core_web_sm
```
### 2) spaCy import/runtime problems on very new Python versions
Use Python 3.13 or lower and reinstall requirements.
### 3) `--template` path errors
Ensure `--template` points to a valid file path, for example:
```bash
--template json_template_example.json
```
If your input is already a keyword list, use `--keywords` instead of putting the list in `--template`.
## Design Notes
- deterministic and explainable (no LLM calls)
- domain mappings are easy to extend in `keyword_extractor.py` and `json_inference.py`
- scoring and template fill rules are intentionally simple and stable for game-content generation
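For example, an extra synonym can be registered without editing the module defaults (a sketch; the added `"scorch"` entry is hypothetical):
```python
from keyword_extractor import DEFAULT_NORMALIZATION_MAP, KeywordExtractor

# Hypothetical extension: register one more synonym for the "fire" concept.
custom_map = dict(DEFAULT_NORMALIZATION_MAP)
custom_map["fire"] = custom_map["fire"] + ["scorch"]

extractor = KeywordExtractor.from_default_model(normalization_map=custom_map)
print(extractor.extract("a dragon that can scorch its foes"))  # "scorch" now maps to "fire"
```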


@@ -0,0 +1,36 @@
import argparse
import json
from typing import Sequence
from keyword_extractor import KeywordExtractor
def _build_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(
description="Extract normalized keywords from cleaned text.",
)
parser.add_argument(
"text",
nargs="+",
help="Input text to process. Pass as one quoted string or multiple words.",
)
parser.add_argument(
"--model",
default="en_core_web_sm",
help="spaCy model name (default: en_core_web_sm).",
)
return parser
def main(argv: Sequence[str] | None = None) -> None:
parser = _build_parser()
args = parser.parse_args(argv)
text = " ".join(args.text)
extractor = KeywordExtractor.from_default_model(model_name=args.model)
keywords = extractor.extract(text)
print(json.dumps(keywords))
if __name__ == "__main__":
main()


@@ -0,0 +1,137 @@
"""Rule-based keyword extraction and normalization for Pokemon card generation."""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any, Dict, Iterable, List, Mapping, Optional, Sequence, Set, Tuple
# Canonical concept -> synonym list
DEFAULT_NORMALIZATION_MAP: Dict[str, List[str]] = {
"normal": ["basic", "common", "regular", "plain"],
"fire": ["flame", "flames", "burn", "burning", "blaze", "fiery", "heat", "inferno"],
"water": ["wave", "ocean", "sea", "river", "aqua", "splash", "tidal"],
"grass": ["plant", "leaf", "forest", "nature", "vine", "seed", "flora"],
"flying": ["air", "wind", "sky", "wing", "wings", "flight", "soar"],
"fighting": ["punch", "kick", "strike", "martial", "combat", "brawl"],
"poison": ["toxic", "venom", "acid", "poisonous", "toxin"],
"electric": ["lightning", "thunder", "shock", "volt", "spark", "electricity"],
"ground": ["earth", "soil", "sand", "mud", "quake", "dust"],
"rock": ["stone", "boulder", "crystal", "rocky", "pebble"],
"psychic": ["mind", "mental", "telepathy", "psyonic", "brain", "illusion"],
"ice": ["freeze", "frozen", "snow", "frost", "blizzard", "icy"],
"bug": ["insect", "ant", "beetle", "spider", "crawler"],
"ghost": ["spirit", "phantom", "haunt", "shadow", "specter"],
"steel": ["metal", "iron", "armor", "blade", "alloy"],
"dragon": ["drake", "wyrm", "serpent", "legendary"],
"dark": ["shadow", "evil", "night", "doom", "darkness"],
"fairy": ["magic", "magical", "sparkle", "light", "charm"],
}
DEFAULT_ALLOWED_POS: Tuple[str, ...] = ("NOUN", "ADJ", "VERB")
def _invert_normalization_map(normalization_map: Mapping[str, Iterable[str]]) -> Dict[str, str]:
"""Build synonym -> canonical mapping for O(1) normalization lookup."""
inverse: Dict[str, str] = {}
for canonical, synonyms in normalization_map.items():
canonical_normalized = canonical.strip().lower()
inverse[canonical_normalized] = canonical_normalized
for synonym in synonyms:
synonym_normalized = synonym.strip().lower()
if synonym_normalized:
inverse[synonym_normalized] = canonical_normalized
return inverse
def _deduplicate_preserve_order(items: Iterable[str]) -> List[str]:
seen: Set[str] = set()
output: List[str] = []
for item in items:
if item not in seen:
seen.add(item)
output.append(item)
return output
@dataclass
class KeywordExtractor:
"""Deterministic spaCy + rule-based keyword extraction pipeline."""
nlp: Any
normalization_map: Mapping[str, Iterable[str]] = field(default_factory=lambda: DEFAULT_NORMALIZATION_MAP)
allowed_pos: Sequence[str] = field(default_factory=lambda: DEFAULT_ALLOWED_POS)
def __post_init__(self) -> None:
self._normalization_lookup = _invert_normalization_map(self.normalization_map)
self._allowed_pos_set = set(self.allowed_pos)
@classmethod
def from_default_model(
cls,
model_name: str = "en_core_web_sm",
normalization_map: Optional[Mapping[str, Iterable[str]]] = None,
allowed_pos: Sequence[str] = DEFAULT_ALLOWED_POS,
) -> "KeywordExtractor":
"""Initialize extractor with a spaCy English pipeline."""
try:
import spacy
nlp = spacy.load(model_name)
except OSError as exc:
raise OSError(
f"spaCy model '{model_name}' is not installed. "
"Run: python -m spacy download en_core_web_sm"
) from exc
except Exception as exc:
raise RuntimeError(
"spaCy could not be loaded in this Python environment. "
"Try Python 3.13 or lower, then install spaCy and en_core_web_sm."
) from exc
return cls(
nlp=nlp,
normalization_map=normalization_map or DEFAULT_NORMALIZATION_MAP,
allowed_pos=allowed_pos,
)
def extract(self, text: str) -> List[str]:
"""Extract and normalize keywords from already-cleaned text."""
if not text or not text.strip():
return []
doc = self.nlp(text)
# Step 1: POS filtering + base normalization to lowercase lemmas/tokens.
raw_keywords: List[str] = []
for token in doc:
if token.is_stop or token.is_punct or token.pos_ not in self._allowed_pos_set:
continue
# Use lemma where possible to collapse inflections.
base = token.lemma_.lower().strip() if token.lemma_ and token.lemma_ != "-PRON-" else token.text.lower().strip()
if base:
raw_keywords.append(base)
# Step 2: Deduplicate before domain normalization (as requested in README).
deduplicated = _deduplicate_preserve_order(raw_keywords)
# Step 3: Map variants/synonyms to canonical concepts.
normalized = [self._normalize_keyword(keyword) for keyword in deduplicated]
# Step 4: Deduplicate again, since multiple words can map to one concept.
return _deduplicate_preserve_order(normalized)
def _normalize_keyword(self, keyword: str) -> str:
keyword_lower = keyword.lower()
return self._normalization_lookup.get(keyword_lower, keyword_lower)
def extract_keywords(
text: str,
extractor: Optional[KeywordExtractor] = None,
) -> List[str]:
"""Convenience API to extract keywords with default extractor config."""
active_extractor = extractor or KeywordExtractor.from_default_model()
return active_extractor.extract(text)


@@ -0,0 +1,88 @@
import unittest
from keyword_extractor import KeywordExtractor
class FakeToken:
def __init__(self, text: str, pos: str, lemma: str, is_stop: bool) -> None:
self.text = text
self.pos_ = pos
self.lemma_ = lemma
self.is_stop = is_stop
self.is_punct = not any(ch.isalnum() for ch in text)
class FakeNLP:
def __init__(self, tag_map, stopwords) -> None:
self.tag_map = tag_map
self.stopwords = stopwords
def __call__(self, text: str):
tokens = []
for raw in text.split():
token_text = raw.strip()
lowered = token_text.lower()
tokens.append(
FakeToken(
text=token_text,
pos=self.tag_map.get(lowered, "NOUN"),
lemma=lowered,
is_stop=lowered in self.stopwords,
)
)
return tokens
class KeywordExtractorTests(unittest.TestCase):
@classmethod
def setUpClass(cls) -> None:
tag_map = {
"fiery": "ADJ",
"dragon": "NOUN",
"attack": "VERB",
"explosive": "ADJ",
"flames": "NOUN",
"burning": "ADJ",
"creature": "NOUN",
"with": "ADP",
"blaze": "NOUN",
"power": "NOUN",
"electric": "ADJ",
"mouse": "NOUN",
"using": "VERB",
"thunder": "NOUN",
"shock": "NOUN",
"a": "DET",
"very": "ADV",
"strong": "ADJ",
"and": "CCONJ",
"dangerous": "ADJ",
}
stopwords = {"a", "very", "and", "with"}
cls.nlp = FakeNLP(tag_map=tag_map, stopwords=stopwords)
cls.extractor = KeywordExtractor(nlp=cls.nlp)
def test_readme_main_example(self) -> None:
text = "fiery dragon attack explosive flames"
result = self.extractor.extract(text)
self.assertEqual(result, ["fire", "dragon", "attack", "explosion"])
def test_synonym_normalization(self) -> None:
text = "burning creature with blaze power"
result = self.extractor.extract(text)
self.assertEqual(result, ["fire", "creature", "power"])
def test_mixed_types(self) -> None:
text = "electric mouse using thunder shock"
result = self.extractor.extract(text)
self.assertEqual(result, ["electric", "mouse", "using"])
def test_noise_input(self) -> None:
text = "a very very strong and dangerous creature"
result = self.extractor.extract(text)
self.assertEqual(result, ["strong", "dangerous", "creature"])
if __name__ == "__main__":
unittest.main()


@@ -0,0 +1,189 @@
# Pokemon Text-to-JSON Pipeline
This project converts free-form Pokemon description text into:
1. A normalized keyword list
2. A populated Pokemon JSON object (from a blank/key-only template)
The pipeline is deterministic and rule-based.
## Architecture
### Stage 1: Keyword Extraction
File: `keyword_extractor.py`
Input: raw text description
Core logic:
- spaCy tokenization and POS tagging
- POS filtering (`NOUN`, `ADJ`, `VERB`)
- stopword and punctuation removal
- lemma-based normalization
- domain synonym normalization (example: `flames -> fire`)
- optional YAKE relevance scoring
- conservative retention policy so detail is not over-pruned
Output: ordered list of normalized keywords
### Stage 2: JSON Inference
File: `json_inference.py`
Input: keyword list + optional JSON template
Core logic:
- infer primary/secondary type
- infer name candidate
- infer attacks, abilities, habitat, personality
- infer basic stats (`hp`, `attack`, `defense`, `speed`)
- fill nested TCG-like template fields (`types`, `attacks`, `weaknesses`, `stage`, `retreat`, etc.)
- preserve already non-empty values in the provided template
Output: inferred JSON profile
### Stage 3: Orchestration CLI
File: `infer_json_usage.py`
This is the main entrypoint for end-to-end usage.
Default behavior:
1. prints extracted keyword list
2. prints inferred JSON
## Project Structure
- `keyword_extractor.py`: keyword extraction engine
- `json_inference.py`: keyword-to-JSON inference logic
- `infer_json_usage.py`: end-to-end CLI
- `example_usage.py`: keyword extraction only CLI
- `json_template_example.json`: sample blank/key-only template
- `test_keyword_extractor.py`: extraction tests
- `test_json_inference.py`: inference tests
- `requirements.txt`: Python dependencies
## Requirements
- Python 3.13 or lower is recommended for spaCy compatibility
- pip
Dependencies in `requirements.txt`:
- `spacy>=3.7.0`
- `yake>=0.4.2`
## Setup
1. Create and activate a virtual environment (recommended)
```bash
python -m venv .venv
source .venv/bin/activate
```
2. Install dependencies
```bash
pip install -r requirements.txt
```
3. Install spaCy English model
```bash
python -m spacy download en_core_web_sm
```
## How To Run
### A) Extract keywords only
```bash
python example_usage.py "furret long slender agile creature with soft fur"
```
Output: JSON list of keywords.
### B) End-to-end: text -> keywords -> JSON
```bash
python infer_json_usage.py --template json_template_example.json "furret long slender agile creature with soft fur"
```
Output order:
1. keyword list
2. inferred JSON
### C) End-to-end but JSON only
```bash
python infer_json_usage.py --json-only --template json_template_example.json "furret long slender agile creature with soft fur"
```
### D) Start from keywords directly
```bash
python infer_json_usage.py --template json_template_example.json --keywords furret normal tail smash tunnel agile cheerful explore endurance
```
Tip: If you pass `--keywords`, text extraction is skipped.
## Template Behavior
If `--template` is omitted, inference returns a full inferred profile object.
If `--template` is provided:
- empty fields are populated from inferred values
- non-empty fields are preserved
Current sample template supports nested card-like data including:
- `types`
- `attacks` with `cost`, `name`, `effect`, `damage`
- `weaknesses` with `type`, `value`
- `stage`, `retreat`, `legal`
## Tests
Run all tests:
```bash
python -m unittest -q
```
## Troubleshooting
### 1) spaCy model not found
Error mentions `en_core_web_sm` not installed.
Fix:
```bash
python -m spacy download en_core_web_sm
```
### 2) spaCy import/runtime problems on very new Python versions
Use Python 3.13 or lower and reinstall requirements.
### 3) `--template` path errors
Ensure `--template` points to a valid file path, for example:
```bash
--template json_template_example.json
```
If your input is already a keyword list, use `--keywords` instead of putting the list in `--template`.
## Design Notes
- deterministic and explainable (no LLM calls)
- domain mappings are easy to extend in `keyword_extractor.py` and `json_inference.py`
- scoring and template fill rules are intentionally simple and stable for game-content generation


@@ -0,0 +1,36 @@
import argparse
import json
from typing import Sequence
from keyword_extractor import KeywordExtractor
def _build_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(
description="Extract normalized keywords from cleaned text.",
)
parser.add_argument(
"text",
nargs="+",
help="Input text to process. Pass as one quoted string or multiple words.",
)
parser.add_argument(
"--model",
default="en_core_web_sm",
help="spaCy model name (default: en_core_web_sm).",
)
return parser
def main(argv: Sequence[str] | None = None) -> None:
parser = _build_parser()
args = parser.parse_args(argv)
text = " ".join(args.text)
extractor = KeywordExtractor.from_default_model(model_name=args.model)
keywords = extractor.extract(text)
print(json.dumps(keywords))
if __name__ == "__main__":
main()


@@ -0,0 +1,111 @@
import argparse
import json
import os
import re
from typing import Sequence
from keyword_extractor import KeywordExtractor
from json_inference import fill_template_from_keywords
def _build_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(
description="Extract keywords and infer values into a JSON template.",
)
parser.add_argument(
"text",
nargs="*",
help="Input description text.",
)
parser.add_argument(
"--template",
default="",
help="Path to JSON template file with keys only. If omitted, full inferred JSON is returned.",
)
parser.add_argument(
"--model",
default="en_core_web_sm",
help="spaCy model name (default: en_core_web_sm).",
)
parser.add_argument(
"--keywords",
nargs="+",
default=None,
help="Provide keywords directly instead of raw text.",
)
parser.add_argument(
"--json-only",
action="store_true",
help="Print only inferred JSON (skip keyword list output).",
)
return parser
def _load_template(path: str):
if not path:
return {}
if not os.path.exists(path):
raise FileNotFoundError(f"Template file not found: {path}")
with open(path, "r", encoding="utf-8") as file_handle:
raw = file_handle.read().strip()
if not raw:
return {}
return json.loads(raw)
def _parse_keywords_fragment(raw: str):
if not raw.strip():
return []
try:
parsed = json.loads(raw)
if isinstance(parsed, list):
return [str(item).strip().lower() for item in parsed if str(item).strip()]
except json.JSONDecodeError:
pass
tokens = re.findall(r"[a-zA-Z0-9_-]+", raw.lower())
return [token for token in tokens if token]
def _extract_keywords(args):
if args.keywords:
return [word.strip().lower() for word in args.keywords if word.strip()]
if args.template and not os.path.exists(args.template) and args.template.lstrip().startswith("["):
raw = " ".join([args.template] + args.text)
return _parse_keywords_fragment(raw)
if not args.text:
raise ValueError("Provide input text or use --keywords.")
text = " ".join(args.text)
extractor = KeywordExtractor.from_default_model(model_name=args.model)
return extractor.extract(text)
def main(argv: Sequence[str] | None = None) -> None:
parser = _build_parser()
args = parser.parse_args(argv)
keywords = _extract_keywords(args)
template_path = args.template
if args.template and not os.path.exists(args.template) and args.template.lstrip().startswith("["):
template_path = ""
template = _load_template(template_path)
inferred_json = fill_template_from_keywords(template, keywords)
if args.json_only:
print(json.dumps(inferred_json, indent=2))
return
print(json.dumps(keywords))
print(json.dumps(inferred_json, indent=2))
if __name__ == "__main__":
main()


@@ -0,0 +1,398 @@
"""Infer Pokemon-like JSON values from extracted keywords."""
from __future__ import annotations
from copy import deepcopy
from typing import Any, Dict, Iterable, List, Mapping, Sequence
POKEMON_TYPES = {
"normal",
"fire",
"water",
"grass",
"electric",
"ice",
"fighting",
"poison",
"ground",
"flying",
"psychic",
"bug",
"rock",
"ghost",
"dragon",
"dark",
"steel",
"fairy",
}
HABITAT_KEYWORDS = {
"forest",
"field",
"cave",
"mountain",
"river",
"ocean",
"sea",
"tunnel",
"nest",
"sky",
"desert",
"swamp",
"volcano",
}
PERSONALITY_KEYWORDS = {
"calm",
"gentle",
"agile",
"playful",
"cheerful",
"energetic",
"curious",
"fierce",
"brave",
"loyal",
"timid",
"bold",
}
MOVE_KEYWORDS = {
"attack",
"smash",
"strike",
"kick",
"punch",
"shock",
"thunder",
"bolt",
"blast",
"explosion",
"freeze",
"bite",
"claw",
"tail",
"fight",
}
ABILITY_KEYWORDS = {
"recover",
"endurance",
"explore",
"hide",
"wander",
"bond",
"speed",
"power",
"energy",
"flexible",
}
STAT_HINTS = {
"hp": {"endurance", "recover", "energy", "stamina", "healthy", "vital"},
"attack": {"attack", "smash", "strike", "punch", "kick", "claw", "fight", "power"},
"defense": {"armor", "shield", "tough", "hard", "resist", "solid"},
"speed": {"speed", "swift", "agile", "quick", "fast", "dash"},
}
KEY_ALIASES = {
"name": {"name", "pokemon_name"},
"type": {"type", "primary_type", "pokemon_type"},
"secondary_type": {"secondary_type", "type2", "secondary"},
"attacks": {"attacks", "moves", "skills", "offense"},
"abilities": {"abilities", "traits", "passives", "special_abilities"},
"habitat": {"habitat", "environment", "region"},
"personality": {"personality", "temperament", "nature"},
"description": {"description", "flavor_text", "summary", "lore"},
"keywords": {"keywords", "tags"},
"hp": {"hp", "health", "health_points"},
"attack": {"attack", "atk"},
"defense": {"defense", "def"},
"speed": {"speed", "spd"},
}
GENERIC_NAME_BLACKLIST = {
"black",
"white",
"yellow",
"red",
"blue",
"green",
"purple",
"orange",
"pink",
"gray",
"grey",
"brown",
"fur",
"body",
"tail",
"claw",
"storm",
"cloud",
"enemy",
"super",
"scary",
"giant",
"speed",
}
TYPE_WEAKNESS = {
"normal": "fighting",
"fire": "water",
"water": "electric",
"grass": "fire",
"electric": "ground",
"ice": "fire",
"fighting": "psychic",
"poison": "ground",
"ground": "water",
"flying": "electric",
"psychic": "dark",
"bug": "fire",
"rock": "water",
"ghost": "dark",
"dragon": "fairy",
"dark": "fighting",
"steel": "fire",
"fairy": "steel",
}
def _title_case(value: str) -> str:
return " ".join(part.capitalize() for part in value.split())
def _is_empty_value(value: Any) -> bool:
if value is None:
return True
if isinstance(value, str):
return value.strip() == ""
if isinstance(value, (list, dict, tuple, set)):
return len(value) == 0
return False
def _canonical_key(key: str) -> str:
lowered = key.lower().strip()
for canonical, aliases in KEY_ALIASES.items():
if lowered in aliases:
return canonical
return lowered
def _pick_name(keywords: Sequence[str]) -> str:
for keyword in keywords:
if keyword in POKEMON_TYPES:
continue
if keyword in HABITAT_KEYWORDS:
continue
if keyword in MOVE_KEYWORDS:
continue
if keyword in ABILITY_KEYWORDS:
continue
if keyword in PERSONALITY_KEYWORDS:
continue
if keyword in GENERIC_NAME_BLACKLIST:
continue
if len(keyword) < 4:
continue
return _title_case(keyword)
return "Unknown"
def _pick_types(keywords: Sequence[str]) -> List[str]:
types: List[str] = []
for keyword in keywords:
if keyword in POKEMON_TYPES and keyword not in types:
types.append(keyword)
if len(types) >= 2:
break
if not types:
types.append("normal")
return types
def _pick_habitat(keywords: Sequence[str]) -> str:
habitats = [word for word in keywords if word in HABITAT_KEYWORDS]
if not habitats:
return "unknown"
return habitats[0]
def _pick_personality(keywords: Sequence[str]) -> List[str]:
result: List[str] = []
for keyword in keywords:
if keyword in PERSONALITY_KEYWORDS and keyword not in result:
result.append(keyword)
return result[:3]
def _pick_attacks(keywords: Sequence[str]) -> List[str]:
attacks: List[str] = []
for keyword in keywords:
if keyword in MOVE_KEYWORDS and keyword not in attacks:
attacks.append(keyword)
return attacks[:4]
def _pick_abilities(keywords: Sequence[str]) -> List[str]:
abilities: List[str] = []
for keyword in keywords:
if keyword in ABILITY_KEYWORDS and keyword not in abilities:
abilities.append(keyword)
return abilities[:4]
def _score_stat(base: int, keywords: Sequence[str], hints: Iterable[str]) -> int:
hint_set = set(hints)
matches = sum(1 for keyword in keywords if keyword in hint_set)
# Each match adds 10 points; keep stats in [40, 160].
return max(40, min(160, base + (matches * 10)))
def _build_description(name: str, primary_type: str, attacks: Sequence[str], abilities: Sequence[str], habitat: str) -> str:
attack_text = ", ".join(attacks) if attacks else "basic combat"
ability_text = ", ".join(abilities) if abilities else "balanced adaptation"
return (
f"{name} is a {primary_type}-type Pokemon often found in {habitat}. "
f"It commonly uses {attack_text} and shows abilities like {ability_text}."
)
def _retreat_cost_from_speed(speed: int) -> int:
if speed >= 120:
return 0
if speed >= 90:
return 1
if speed >= 70:
return 2
return 3
def _attack_damage_from_attack_stat(attack_stat: int, index: int) -> int:
# Keep card damage in simple 10-step increments.
base = 30 + max(0, attack_stat - 70) // 2
adjusted = base + (index * 10)
return max(10, min(160, (adjusted // 10) * 10))
def _energy_name_for_type(pokemon_type: str) -> str:
if pokemon_type == "normal":
return "Colorless"
return _title_case(pokemon_type)
def _fill_tcg_like_template(output: Dict[str, Any], inferred: Mapping[str, Any]) -> None:
if "name" in output and _is_empty_value(output.get("name")):
output["name"] = inferred["name"]
if "description" in output and _is_empty_value(output.get("description")):
output["description"] = inferred["description"]
if "hp" in output and _is_empty_value(output.get("hp")):
hp_value = inferred["hp"]
output["hp"] = str(hp_value) if isinstance(output.get("hp"), str) else hp_value
if "types" in output and isinstance(output.get("types"), list):
types_value = output["types"]
if len(types_value) == 0 or all(_is_empty_value(item) for item in types_value):
inferred_types = [inferred["type"]]
if inferred.get("secondary_type"):
inferred_types.append(inferred["secondary_type"])
output["types"] = inferred_types
if "stage" in output and _is_empty_value(output.get("stage")):
output["stage"] = "Basic"
if "retreat" in output and (output.get("retreat") in (None, 0, "")):
output["retreat"] = _retreat_cost_from_speed(int(inferred["speed"]))
if "weaknesses" in output and isinstance(output.get("weaknesses"), list):
weaknesses = output["weaknesses"]
if weaknesses:
weakness_type = TYPE_WEAKNESS.get(inferred["type"], "fighting")
first = weaknesses[0]
if isinstance(first, dict):
if _is_empty_value(first.get("type")):
first["type"] = weakness_type
if _is_empty_value(first.get("value")):
first["value"] = "x2"
if "attacks" in output and isinstance(output.get("attacks"), list):
attack_entries = output["attacks"]
inferred_attacks = inferred["attacks"]
inferred_type = inferred["type"]
for idx, attack_entry in enumerate(attack_entries):
if not isinstance(attack_entry, dict):
continue
attack_name = inferred_attacks[idx] if idx < len(inferred_attacks) else "tackle"
attack_title = _title_case(attack_name)
if _is_empty_value(attack_entry.get("name")):
attack_entry["name"] = attack_title
if _is_empty_value(attack_entry.get("effect")):
attack_entry["effect"] = f"Deals damage with {attack_name}."
if "damage" in attack_entry and (attack_entry.get("damage") in (None, 0, "")):
attack_entry["damage"] = _attack_damage_from_attack_stat(int(inferred["attack"]), idx)
if "cost" in attack_entry and isinstance(attack_entry.get("cost"), list):
current_cost = attack_entry["cost"]
if len(current_cost) == 0 or all(_is_empty_value(item) for item in current_cost):
attack_entry["cost"] = [_energy_name_for_type(inferred_type)]
def infer_profile_from_keywords(keywords: Sequence[str]) -> Dict[str, Any]:
cleaned = [k.strip().lower() for k in keywords if k and k.strip()]
name = _pick_name(cleaned)
types = _pick_types(cleaned)
attacks = _pick_attacks(cleaned)
abilities = _pick_abilities(cleaned)
habitat = _pick_habitat(cleaned)
personality = _pick_personality(cleaned)
hp = _score_stat(70, cleaned, STAT_HINTS["hp"])
attack = _score_stat(70, cleaned, STAT_HINTS["attack"])
defense = _score_stat(70, cleaned, STAT_HINTS["defense"])
speed = _score_stat(70, cleaned, STAT_HINTS["speed"])
return {
"name": name,
"type": types[0],
"secondary_type": types[1] if len(types) > 1 else None,
"attacks": attacks,
"abilities": abilities,
"habitat": habitat,
"personality": personality,
"hp": hp,
"attack": attack,
"defense": defense,
"speed": speed,
"keywords": cleaned,
"description": _build_description(name, types[0], attacks, abilities, habitat),
}
def fill_template_from_keywords(template: Mapping[str, Any], keywords: Sequence[str]) -> Dict[str, Any]:
"""Fill a key-only template by inferring values from keywords.
Existing non-empty values in template are preserved.
"""
inferred = infer_profile_from_keywords(keywords)
output: Dict[str, Any] = deepcopy(dict(template))
if not output:
return inferred
_fill_tcg_like_template(output, inferred)
for key, current_value in output.items():
canonical = _canonical_key(key)
if canonical not in inferred:
continue
if _is_empty_value(current_value):
output[key] = inferred[canonical]
return output


@@ -0,0 +1,35 @@
{
"category": "Pokemon",
"name": "",
"rarity": "",
"hp": "",
"types": [""],
"evolveFrom": "",
"description": "",
"stage": "",
"attacks": [
{
"cost": [""],
"name": "",
"effect": ""
},
{
"cost": [""],
"name": "",
"effect": "",
"damage": 0
}
],
"weaknesses": [
{
"type": "",
"value": ""
}
],
"retreat": 0,
"regulationMark": "",
"legal": {
"standard": true,
"expanded": true
}
}


@@ -0,0 +1,248 @@
"""Rule-based keyword extraction and normalization for Pokemon card generation."""
from __future__ import annotations
import math
import re
from dataclasses import dataclass, field
from typing import Any, Dict, Iterable, List, Mapping, Optional, Sequence, Set, Tuple
DEFAULT_NORMALIZATION_MAP: Dict[str, List[str]] = {
"normal": ["basic", "common", "regular", "plain", "normaltype"],
"fire": ["flame", "flames", "burn", "burning", "blaze", "fiery", "heat", "inferno"],
"water": ["wave", "ocean", "sea", "river", "aqua", "splash", "tidal"],
"grass": ["plant", "leaf", "forest", "nature", "vine", "seed", "flora"],
"flying": ["air", "wind", "sky", "wing", "wings", "flight", "soar"],
"fighting": ["punch", "kick", "strike", "martial", "combat", "brawl"],
"poison": ["toxic", "venom", "acid", "poisonous", "toxin"],
"electric": ["lightning", "thunder", "shock", "volt", "spark", "electricity"],
"ground": ["earth", "soil", "sand", "mud", "quake", "dust"],
"rock": ["stone", "boulder", "crystal", "rocky", "pebble"],
"psychic": ["mind", "mental", "telepathy", "psyonic", "brain", "illusion"],
"ice": ["freeze", "frozen", "snow", "frost", "blizzard", "icy"],
"bug": ["insect", "ant", "beetle", "spider", "crawler"],
"ghost": ["spirit", "phantom", "haunt", "shadow", "specter"],
"steel": ["metal", "iron", "armor", "blade", "alloy"],
"dragon": ["drake", "wyrm", "serpent", "legendary"],
"dark": ["shadow", "evil", "night", "doom", "darkness"],
"fairy": ["magic", "magical", "sparkle", "light", "charm"],
"explosion": ["explosive", "explode", "blast"],
}
DEFAULT_ALLOWED_POS: Tuple[str, ...] = ("NOUN", "ADJ", "VERB")
DEFAULT_IGNORED_KEYWORDS: Set[str] = {"preevolution", "pokmon"}
DEFAULT_POS_WEIGHTS: Dict[str, float] = {
"NOUN": 3.0,
"ADJ": 2.0,
"VERB": 1.0,
}
DEFAULT_KEEP_RATIO = 0.8
DEFAULT_MIN_KEYWORDS = 12
DEFAULT_MAX_KEYWORDS = 30
def _invert_normalization_map(normalization_map: Mapping[str, Iterable[str]]) -> Dict[str, str]:
"""Build synonym -> canonical mapping for O(1) normalization lookup."""
inverse: Dict[str, str] = {}
for canonical, synonyms in normalization_map.items():
canonical_normalized = canonical.strip().lower()
inverse[canonical_normalized] = canonical_normalized
for synonym in synonyms:
synonym_normalized = synonym.strip().lower()
if synonym_normalized:
inverse[synonym_normalized] = canonical_normalized
return inverse
def _tokenize_keyword_phrase(value: str) -> List[str]:
return re.findall(r"[a-z0-9]+", value.lower())
@dataclass
class KeywordExtractor:
"""Deterministic spaCy + YAKE + rule-based normalization pipeline."""
nlp: Any
normalization_map: Mapping[str, Iterable[str]] = field(default_factory=lambda: DEFAULT_NORMALIZATION_MAP)
allowed_pos: Sequence[str] = field(default_factory=lambda: DEFAULT_ALLOWED_POS)
ignored_keywords: Set[str] = field(default_factory=lambda: set(DEFAULT_IGNORED_KEYWORDS))
pos_weights: Mapping[str, float] = field(default_factory=lambda: DEFAULT_POS_WEIGHTS)
keep_ratio: float = DEFAULT_KEEP_RATIO
min_keywords: int = DEFAULT_MIN_KEYWORDS
max_keywords: int = DEFAULT_MAX_KEYWORDS
use_yake: bool = True
def __post_init__(self) -> None:
self._normalization_lookup = _invert_normalization_map(self.normalization_map)
self._allowed_pos_set = set(self.allowed_pos)
self._ignored_keywords = {keyword.lower().strip() for keyword in self.ignored_keywords}
self._pos_weight_lookup = {k.upper(): float(v) for k, v in self.pos_weights.items()}
@classmethod
def from_default_model(
cls,
model_name: str = "en_core_web_sm",
normalization_map: Optional[Mapping[str, Iterable[str]]] = None,
allowed_pos: Sequence[str] = DEFAULT_ALLOWED_POS,
ignored_keywords: Optional[Set[str]] = None,
pos_weights: Mapping[str, float] = DEFAULT_POS_WEIGHTS,
keep_ratio: float = DEFAULT_KEEP_RATIO,
min_keywords: int = DEFAULT_MIN_KEYWORDS,
max_keywords: int = DEFAULT_MAX_KEYWORDS,
use_yake: bool = True,
) -> "KeywordExtractor":
"""Initialize extractor with a spaCy English pipeline."""
try:
import spacy
nlp = spacy.load(model_name)
except OSError as exc:
raise OSError(
f"spaCy model '{model_name}' is not installed. "
"Run: python -m spacy download en_core_web_sm"
) from exc
except Exception as exc:
raise RuntimeError(
"spaCy could not be loaded in this Python environment. "
"Try Python 3.13 or lower, then install spaCy and en_core_web_sm."
) from exc
return cls(
nlp=nlp,
normalization_map=normalization_map or DEFAULT_NORMALIZATION_MAP,
allowed_pos=allowed_pos,
ignored_keywords=ignored_keywords or set(DEFAULT_IGNORED_KEYWORDS),
pos_weights=pos_weights,
keep_ratio=keep_ratio,
min_keywords=min_keywords,
max_keywords=max_keywords,
use_yake=use_yake,
)
def extract(self, text: str) -> List[str]:
"""Extract, normalize and rank keywords from already-cleaned text."""
if not text or not text.strip():
return []
doc = self.nlp(text)
# Step 1: POS filtering + lowercase lemma/token extraction.
raw_keywords: List[Tuple[str, str]] = []
for token in doc:
if token.is_stop or token.is_punct or token.pos_ not in self._allowed_pos_set:
continue
base = token.lemma_.lower().strip() if token.lemma_ and token.lemma_ != "-PRON-" else token.text.lower().strip()
if base and base not in self._ignored_keywords:
raw_keywords.append((base, token.pos_))
# Step 2: Deduplicate before domain normalization.
deduplicated: List[Tuple[str, str]] = []
seen_raw: Set[str] = set()
for keyword, pos in raw_keywords:
if keyword in seen_raw:
continue
seen_raw.add(keyword)
deduplicated.append((keyword, pos))
# Step 3: Normalize and deduplicate canonical forms.
unique_entries: List[Tuple[str, str, str, int]] = []
seen_normalized: Set[str] = set()
for index, (original_keyword, pos) in enumerate(deduplicated):
normalized_keyword = self._normalize_keyword(original_keyword)
if normalized_keyword in seen_normalized:
continue
seen_normalized.add(normalized_keyword)
unique_entries.append((original_keyword, normalized_keyword, pos, index))
if not unique_entries:
return []
if not self.use_yake:
return [normalized_keyword for _, normalized_keyword, _, _ in unique_entries]
# Step 4: YAKE scoring + conservative selection to preserve detail.
yake_scores = self._extract_yake_scores(text)
if not yake_scores:
return [normalized_keyword for _, normalized_keyword, _, _ in unique_entries]
ranked: List[Tuple[float, int, str]] = []
for original_keyword, normalized_keyword, pos, index in unique_entries:
score_candidates: List[float] = []
if original_keyword in yake_scores:
score_candidates.append(yake_scores[original_keyword])
if normalized_keyword in yake_scores:
score_candidates.append(yake_scores[normalized_keyword])
# Missing score is treated as moderately relevant to avoid over-pruning.
yake_penalty = min(score_candidates) if score_candidates else 0.45
pos_weight = self._pos_weight_lookup.get(pos.upper(), 1.0)
combined_score = (1.0 - yake_penalty) * pos_weight
ranked.append((combined_score, index, normalized_keyword))
target_count = self._compute_target_count(len(ranked))
ranked.sort(key=lambda item: (-item[0], item[1]))
selected = ranked[:target_count]
selected.sort(key=lambda item: item[1])
return [keyword for _, _, keyword in selected]
def _compute_target_count(self, total_keywords: int) -> int:
if total_keywords <= 0:
return 0
target = max(self.min_keywords, math.ceil(total_keywords * self.keep_ratio))
if self.max_keywords > 0:
target = min(target, self.max_keywords)
return min(target, total_keywords)
def _extract_yake_scores(self, text: str) -> Dict[str, float]:
try:
import yake
except Exception:
return {}
text_token_count = len(text.split())
top_n = max(20, min(80, text_token_count * 2))
try:
extractor = yake.KeywordExtractor(lan="en", n=2, dedupLim=0.9, top=top_n)
phrase_scores = extractor.extract_keywords(text)
except Exception:
return {}
token_scores: Dict[str, float] = {}
for phrase, score in phrase_scores:
for token in _tokenize_keyword_phrase(phrase):
existing = token_scores.get(token)
if existing is None or score < existing:
token_scores[token] = score
if not token_scores:
return {}
values = list(token_scores.values())
min_score = min(values)
max_score = max(values)
if math.isclose(min_score, max_score):
return {token: 0.5 for token in token_scores}
# Normalize so 0.0=most important and 1.0=least important.
return {
token: (score - min_score) / (max_score - min_score)
for token, score in token_scores.items()
}
def _normalize_keyword(self, keyword: str) -> str:
keyword_lower = keyword.lower()
return self._normalization_lookup.get(keyword_lower, keyword_lower)
def extract_keywords(
text: str,
extractor: Optional[KeywordExtractor] = None,
) -> List[str]:
"""Convenience API to extract keywords with default extractor config."""
active_extractor = extractor or KeywordExtractor.from_default_model()
return active_extractor.extract(text)


@@ -0,0 +1,2 @@
spacy>=3.7.0
yake>=0.4.2


@@ -0,0 +1,143 @@
import unittest
from json_inference import fill_template_from_keywords, infer_profile_from_keywords
class JsonInferenceTests(unittest.TestCase):
def test_profile_inference_basics(self) -> None:
keywords = [
"zapthorn",
"electric",
"wolf",
"thunder",
"claw",
"speed",
"storm",
"agile",
"forest",
"recover",
"energy",
]
profile = infer_profile_from_keywords(keywords)
self.assertEqual(profile["name"], "Zapthorn")
self.assertEqual(profile["type"], "electric")
self.assertIn("thunder", profile["attacks"])
self.assertIn("claw", profile["attacks"])
self.assertIn("recover", profile["abilities"])
self.assertEqual(profile["habitat"], "forest")
self.assertGreaterEqual(profile["speed"], 80)
def test_fill_key_only_template(self) -> None:
template = {
"name": "",
"type": "",
"secondary_type": None,
"attacks": [],
"abilities": [],
"habitat": "",
"personality": [],
"hp": None,
"attack": None,
"defense": None,
"speed": None,
"description": "",
"keywords": [],
}
keywords = [
"furret",
"normal",
"tail",
"smash",
"tunnel",
"agile",
"cheerful",
"explore",
"endurance",
]
result = fill_template_from_keywords(template, keywords)
self.assertEqual(result["name"], "Furret")
self.assertEqual(result["type"], "normal")
self.assertIn("smash", result["attacks"])
self.assertIn("explore", result["abilities"])
self.assertEqual(result["habitat"], "tunnel")
self.assertIn("cheerful", result["personality"])
self.assertIsInstance(result["description"], str)
self.assertGreater(len(result["description"]), 20)
def test_fill_tcg_style_template(self) -> None:
template = {
"category": "Pokemon",
"name": "",
"hp": "",
"types": [""],
"description": "",
"stage": "",
"attacks": [
{"cost": [""], "name": "", "effect": ""},
{"cost": [""], "name": "", "effect": "", "damage": 0},
],
"weaknesses": [{"type": "", "value": ""}],
"retreat": 0,
}
keywords = [
"zapthorn",
"electric",
"thunder",
"claw",
"speed",
"storm",
"energy",
]
result = fill_template_from_keywords(template, keywords)
self.assertEqual(result["name"], "Zapthorn")
self.assertEqual(result["types"], ["electric"])
self.assertEqual(result["stage"], "Basic")
self.assertTrue(result["hp"].isdigit())
self.assertEqual(result["weaknesses"][0]["type"], "ground")
self.assertEqual(result["weaknesses"][0]["value"], "x2")
self.assertEqual(result["attacks"][0]["name"], "Thunder")
self.assertEqual(result["attacks"][1]["name"], "Claw")
self.assertEqual(result["attacks"][0]["cost"], ["Electric"])
self.assertGreaterEqual(result["retreat"], 0)
def test_name_fallback_to_unknown_for_generic_tokens(self) -> None:
keywords = [
"black",
"fur",
"giant",
"electric",
"claw",
"speed",
"storm",
]
profile = infer_profile_from_keywords(keywords)
self.assertEqual(profile["name"], "Unknown")
def test_preserves_existing_values(self) -> None:
template = {
"name": "CustomName",
"type": "electric",
"attacks": [],
"description": "Already set",
}
keywords = ["furret", "normal", "attack"]
result = fill_template_from_keywords(template, keywords)
self.assertEqual(result["name"], "CustomName")
self.assertEqual(result["type"], "electric")
self.assertEqual(result["description"], "Already set")
self.assertIn("attack", result["attacks"])
if __name__ == "__main__":
unittest.main()


@@ -0,0 +1,166 @@
import unittest
from keyword_extractor import KeywordExtractor
class FakeToken:
def __init__(self, text: str, pos: str, lemma: str, is_stop: bool) -> None:
self.text = text
self.pos_ = pos
self.lemma_ = lemma
self.is_stop = is_stop
self.is_punct = not any(ch.isalnum() for ch in text)
class FakeNLP:
def __init__(self, tag_map, stopwords) -> None:
self.tag_map = tag_map
self.stopwords = stopwords
def __call__(self, text: str):
tokens = []
for raw in text.split():
token_text = raw.strip()
lowered = token_text.lower()
tokens.append(
FakeToken(
text=token_text,
pos=self.tag_map.get(lowered, "NOUN"),
lemma=lowered,
is_stop=lowered in self.stopwords,
)
)
return tokens
class TestableKeywordExtractor(KeywordExtractor):
def __init__(self, *args, yake_scores=None, **kwargs):
super().__init__(*args, **kwargs)
self._test_yake_scores = yake_scores or {}
def _extract_yake_scores(self, text: str):
return self._test_yake_scores
class KeywordExtractorTests(unittest.TestCase):
@classmethod
def setUpClass(cls) -> None:
tag_map = {
"fiery": "ADJ",
"dragon": "NOUN",
"attack": "VERB",
"explosive": "ADJ",
"flames": "NOUN",
"burning": "ADJ",
"creature": "NOUN",
"with": "ADP",
"blaze": "NOUN",
"and": "CCONJ",
"dangerous": "ADJ",
"electric": "ADJ",
"mouse": "NOUN",
"using": "VERB",
"thunder": "NOUN",
"shock": "NOUN",
"strong": "ADJ",
"furret": "NOUN",
"long": "ADJ",
"slender": "ADJ",
"soft": "ADJ",
"fur": "NOUN",
"flexible": "ADJ",
"body": "NOUN",
"move": "VERB",
"gracefully": "ADJ",
"narrow": "ADJ",
"tunnel": "NOUN",
"tail": "NOUN",
"smash": "VERB",
"opponent": "NOUN",
"battle": "NOUN",
"cheerful": "ADJ",
"endurance": "NOUN",
}
stopwords = {
"a",
"very",
"and",
"with",
"the",
"it",
"to",
"its",
"that",
"through",
"in",
}
cls.nlp = FakeNLP(tag_map=tag_map, stopwords=stopwords)
cls.extractor = KeywordExtractor(nlp=cls.nlp, use_yake=False)
def test_readme_main_example(self) -> None:
text = "fiery dragon attack explosive flames"
result = self.extractor.extract(text)
self.assertEqual(result, ["fire", "dragon", "attack", "explosion"])
def test_synonym_normalization(self) -> None:
text = "burning creature with blaze power"
result = self.extractor.extract(text)
self.assertEqual(result, ["fire", "creature", "power"])
def test_mixed_types(self) -> None:
text = "electric mouse using thunder shock"
result = self.extractor.extract(text)
self.assertEqual(result, ["electric", "mouse", "using"])
def test_noise_input(self) -> None:
text = "a very very strong and dangerous creature"
result = self.extractor.extract(text)
self.assertEqual(result, ["strong", "dangerous", "creature"])
def test_yake_keeps_detailed_information(self) -> None:
text = (
"furret long slender creature soft fur flexible body move gracefully narrow tunnel "
"tail smash opponent battle cheerful endurance"
)
yake_scores = {
"furret": 0.00,
"creature": 0.05,
"tail": 0.08,
"battle": 0.10,
"smash": 0.12,
"tunnel": 0.14,
"endurance": 0.18,
"body": 0.20,
"cheerful": 0.22,
"slender": 0.26,
"flexible": 0.28,
"gracefully": 0.34,
"narrow": 0.40,
"long": 0.42,
"soft": 0.44,
"fur": 0.45,
"move": 0.48,
"opponent": 0.52,
}
extractor = TestableKeywordExtractor(
nlp=self.nlp,
use_yake=True,
keep_ratio=0.8,
min_keywords=10,
max_keywords=30,
yake_scores=yake_scores,
)
result = extractor.extract(text)
self.assertGreaterEqual(len(result), 10)
self.assertIn("furret", result)
self.assertIn("creature", result)
self.assertIn("tail", result)
self.assertIn("tunnel", result)
if __name__ == "__main__":
unittest.main()

146
fetch_card.py Normal file

@@ -0,0 +1,146 @@
#!/usr/bin/env python3
"""
Download Pokémon TCG card images with embedded JSON metadata.
Uses the TCGdex SDK to:
1. List all sets (with configurable limit)
2. For each set, list all cards (with configurable limit)
3. Download each card image (PNG) and embed full card data as PNG metadata
"""
import json
import io
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import asdict, is_dataclass
from pathlib import Path
from PIL import Image, PngImagePlugin
from tcgdexsdk import TCGdex, Language
from tcgdexsdk.enums import Quality, Extension
# ── Configuration ──────────────────────────────────────────────
MAX_SETS = 10000 # Number of sets to process (None = all)
MAX_CARDS_PER_SET = 10000 # Number of cards per set (None = all)
OUTPUT_DIR = Path(__file__).resolve().parent / "cards"
IMAGE_QUALITY = Quality.HIGH
MAX_WORKERS = 8 # Parallel download threads
# ───────────────────────────────────────────────────────────────
def card_to_dict(card) -> dict:
"""Convert a card object to a JSON-serialisable dict, skipping SDK internals."""
data = {}
skip = {"sdk", "get_image", "get_image_url"}
for attr in dir(card):
if attr.startswith("_") or attr in skip:
continue
val = getattr(card, attr, None)
if callable(val):
continue
data[attr] = _serialise(val)
return data
def _serialise(obj):
"""Recursively convert dataclass / nested objects to plain dicts."""
if obj is None or isinstance(obj, (str, int, float, bool)):
return obj
if is_dataclass(obj) and not isinstance(obj, type):
return {
k: _serialise(v)
for k, v in asdict(obj).items()
if k != "sdk"
}
if isinstance(obj, list):
return [_serialise(i) for i in obj]
if isinstance(obj, dict):
return {k: _serialise(v) for k, v in obj.items()}
# Fallback: try dataclass-style attribute extraction
if hasattr(obj, "__dict__"):
return {
k: _serialise(v)
for k, v in obj.__dict__.items()
if k != "sdk"
}
return str(obj)
def save_image_with_metadata(image_bytes: bytes, metadata: dict, path: Path):
"""Save a PNG image with JSON metadata embedded in a tEXt chunk."""
img = Image.open(io.BytesIO(image_bytes))
png_info = PngImagePlugin.PngInfo()
png_info.add_text("pokemon_metadata", json.dumps(metadata, ensure_ascii=False))
path.parent.mkdir(parents=True, exist_ok=True)
img.save(str(path), "PNG", pnginfo=png_info)
def process_card(card_id: str, set_dir: Path) -> str | None:
"""Fetch card data + image and save. Returns card description on success."""
sdk = TCGdex(Language.EN)
card = sdk.card.getSync(card_id)
if not card:
return None
resp = card.get_image(IMAGE_QUALITY, Extension.PNG)
image_bytes = resp.read()
metadata = card_to_dict(card)
filename = f"{card.localId}.png"
save_image_with_metadata(image_bytes, metadata, set_dir / filename)
return f"{card.name} ({card.id})"
def main():
sdk = TCGdex(Language.EN)
# 1. Get sets
all_sets = sdk.set.listSync()
if not all_sets:
print("No sets returned.")
return
sets_to_process = all_sets[:MAX_SETS] if MAX_SETS else all_sets
print(f"Processing {len(sets_to_process)} / {len(all_sets)} sets\n")
total_downloaded = 0
for si, set_resume in enumerate(sets_to_process, 1):
full_set = sdk.set.getSync(set_resume.id)
if not full_set or not full_set.cards:
print(f"[{si}] {set_resume.name}: no cards, skipping")
continue
cards = full_set.cards[:MAX_CARDS_PER_SET] if MAX_CARDS_PER_SET else full_set.cards
card_total = full_set.cardCount.total if full_set.cardCount else len(full_set.cards)
print(f"[{si}/{len(sets_to_process)}] {set_resume.name}{len(cards)}/{card_total} cards")
set_dir = OUTPUT_DIR / set_resume.id
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
futures = {
pool.submit(process_card, cr.id, set_dir): cr.id
for cr in cards
}
for future in as_completed(futures):
card_id = futures[future]
try:
result = future.result()
if result:
total_downloaded += 1
print(f" {result}")
else:
print(f" {card_id}: skipped")
except Exception as e:
print(f" {card_id}: failed ({e})")
print()
print(f"Done — {total_downloaded} cards saved to {OUTPUT_DIR}")
if __name__ == "__main__":
main()

BIN
pokeball.png Normal file

Binary file not shown.


File diff suppressed because one or more lines are too long

346
prompt_to_card_pipeline.py Normal file
View File

@@ -0,0 +1,346 @@
"""End-to-end prompt -> cleaned text -> inferred JSON -> generated card image.
This script connects the three stages of the pipeline:
1) call get_clean_text(user_text) from a text-cleaning module file
2) pass cleaned text into infer_json_usage.py with --json-only --template
3) load a checkpoint and generate a card image from inferred metadata
The model-loading part is intentionally pluggable because checkpoint structures vary.
If your .pt checkpoint cannot be used directly as a callable pipeline, provide a
generator module implementing:
def build_pipeline(checkpoint_path: str, device: str): ...
def metadata_to_conditioning(meta: dict) -> str: ... # optional
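
A minimal sketch of such a module (illustrative only; it assumes a diffusers + LoRA
layout like the pokemon_card_lora/ adapter used in this repo):

    def build_pipeline(checkpoint_path: str, device: str):
        from diffusers import StableDiffusionPipeline
        pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
        pipe.load_lora_weights(checkpoint_path)  # adapter weights directory
        return pipe.to(device)

    def metadata_to_conditioning(meta: dict) -> str:
        return f"Pokemon trading card illustration of {meta.get('name', 'Unknown Pokemon')}"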
"""
from __future__ import annotations
import argparse
import importlib
import importlib.util
import json
import subprocess
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Callable, Mapping
def _load_module_from_file(module_file: str):
module_path = Path(module_file).resolve()
if not module_path.exists():
raise FileNotFoundError(f"Module file not found: {module_path}")
spec = importlib.util.spec_from_file_location(module_path.stem, str(module_path))
if spec is None or spec.loader is None:
raise ImportError(f"Cannot import module from file: {module_path}")
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
print("module successfully charged")
return module
def _load_function_from_file(module_file: str, function_name: str) -> Callable[..., Any]:
print("model charging 1")
module = _load_module_from_file(module_file)
print("model charged 1")
if not hasattr(module, function_name):
raise AttributeError(f"{module_file} has no function named '{function_name}'")
func = getattr(module, function_name)
if not callable(func):
raise TypeError(f"{function_name} in {module_file} is not callable")
return func
def _extract_json_from_output(raw: str) -> Mapping[str, Any]:
print("_extract_json_from_output")
stripped = raw.strip()
if not stripped:
raise ValueError("Inference command returned empty output")
try:
parsed = json.loads(stripped)
if not isinstance(parsed, dict):
raise ValueError("Inference output is JSON but not an object")
return parsed
except json.JSONDecodeError:
pass
# Fallback: parse the last JSON object in mixed stdout.
last_open = stripped.rfind("{")
last_close = stripped.rfind("}")
if last_open == -1 or last_close == -1 or last_close <= last_open:
raise ValueError(f"Could not parse JSON from inference output:\n{raw}")
candidate = stripped[last_open : last_close + 1]
parsed = json.loads(candidate)
print("json parsed with success")
if not isinstance(parsed, dict):
raise ValueError("Parsed fallback JSON is not an object")
return parsed
def run_infer_json_cli(
infer_script_path: str,
template_path: str,
cleaned_text: str,
python_executable: str | None = None,
) -> Mapping[str, Any]:
infer_script = Path(infer_script_path).resolve()
print("run_infer_json_cli")
if not infer_script.exists():
raise FileNotFoundError(f"infer_json_usage.py not found: {infer_script}")
template_file = Path(template_path).resolve()
if not template_file.exists():
raise FileNotFoundError(f"Template file not found: {template_file}")
cmd = [
python_executable or sys.executable,
str(infer_script),
"--json-only",
"--template",
str(template_file),
cleaned_text,
]
print("will start result")
result = subprocess.run(cmd, capture_output=True, text=True, check=False)
if result.returncode != 0:
stderr = result.stderr.strip()
raise RuntimeError(
"JSON inference command failed. "
f"exit={result.returncode}, stderr={stderr or '<empty>'}"
)
print("result is done")
return _extract_json_from_output(result.stdout)
def default_metadata_to_conditioning(meta: Mapping[str, Any]) -> str:
print("default_metadata_to_conditioning")
name = str(meta.get("name", "Unknown Pokemon"))
types = meta.get("types") or []
if isinstance(types, list):
type_text = ", ".join(str(item) for item in types if item) or str(meta.get("type", "normal"))
else:
type_text = str(meta.get("type", "normal"))
attacks = meta.get("attacks") or []
attack_names = []
if isinstance(attacks, list):
for attack in attacks:
if isinstance(attack, dict):
value = attack.get("name")
if value:
attack_names.append(str(value))
elif attack:
attack_names.append(str(attack))
hp = str(meta.get("hp", "60"))
description = str(meta.get("description", ""))
parts = [
f"Pokemon trading card illustration of {name}",
f"type: {type_text}",
f"hp: {hp}",
]
if attack_names:
parts.append(f"attacks: {', '.join(attack_names[:2])}")
if description:
parts.append(f"description: {description}")
return "; ".join(parts)
@dataclass
class CheckpointCardGenerator:
checkpoint_path: str
device: str = "cpu"
generator_module_path: str = ""
def __post_init__(self) -> None:
self._pipe = self._build_pipe()
self._metadata_to_conditioning = self._build_conditioning_function()
def _build_pipe(self):
if self.generator_module_path:
print("getting module")
module = _load_module_from_file(self.generator_module_path)
print("module got")
if not hasattr(module, "build_pipeline"):
raise AttributeError(
"Custom generator module must define build_pipeline(checkpoint_path, device)."
)
print("building pipeline")
build_pipeline = getattr(module, "build_pipeline")
if not callable(build_pipeline):
raise TypeError("build_pipeline exists but is not callable")
print("pipeline build")
return build_pipeline(self.checkpoint_path, self.device)
# Best-effort direct checkpoint loading for simple callable pipeline dumps.
try:
torch = importlib.import_module("torch")
except ModuleNotFoundError as exc:
raise RuntimeError(
"torch is required to load checkpoint files. Install torch or provide --generator-module."
) from exc
print("loading checkpoint")
checkpoint = torch.load(self.checkpoint_path, map_location=self.device)
print("checkpoint loaded")
if callable(checkpoint):
return checkpoint
if isinstance(checkpoint, dict):
for key in ("pipe", "pipeline", "model"):
candidate = checkpoint.get(key)
if callable(candidate):
return candidate
raise RuntimeError(
"Could not construct a callable generation pipeline from checkpoint. "
"Pass --generator-module with a build_pipeline() function for your model layout."
)
def _build_conditioning_function(self) -> Callable[[Mapping[str, Any]], str]:
if self.generator_module_path:
print("model charge 2")
module = _load_module_from_file(self.generator_module_path)
print("model charged 2")
if hasattr(module, "metadata_to_conditioning"):
func = getattr(module, "metadata_to_conditioning")
if callable(func):
return func
return default_metadata_to_conditioning
def generate_card_from_metadata(
self,
meta: Mapping[str, Any],
num_inference_steps: int = 30,
guidance_scale: float = 7.5,
save_path: str | None = None,
):
conditioning = self._metadata_to_conditioning(meta)
result = self._pipe(
conditioning,
num_inference_steps=num_inference_steps,
guidance_scale=guidance_scale,
)
if not hasattr(result, "images") or not result.images:
raise RuntimeError(
"Pipeline call did not return an object with non-empty .images. "
"Ensure your pipeline follows diffusers-style output."
)
image = result.images[0]
if save_path:
output_file = Path(save_path).resolve()
output_file.parent.mkdir(parents=True, exist_ok=True)
image.save(str(output_file))
return image
def _build_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(
description="Run text cleaning + JSON inference + card generation in one command.",
)
parser.add_argument("text", help="User input text.")
parser.add_argument(
"--text-cleaner-path",
required=True,
help="Path to text-cleaning-pipeline.py that defines get_clean_text(text).",
)
parser.add_argument(
"--infer-script-path",
required=True,
help="Path to infer_json_usage.py.",
)
parser.add_argument(
"--template",
required=True,
help="Path to JSON template file.",
)
parser.add_argument(
"--checkpoint",
required=True,
help="Path to model checkpoint (example: pokemon_card_lora/training_history.pt).",
)
parser.add_argument(
"--generator-module",
default="",
help="Optional module path defining build_pipeline() and metadata_to_conditioning().",
)
parser.add_argument("--device", default="cpu", help="Checkpoint loading device (default: cpu).")
parser.add_argument("--num-inference-steps", type=int, default=30)
parser.add_argument("--guidance-scale", type=float, default=7.5)
parser.add_argument("--save-path", default="generated_card.png")
parser.add_argument(
"--python-executable",
default=sys.executable,
help="Python executable used to run infer_json_usage.py (default: current interpreter).",
)
parser.add_argument(
"--print-json",
action="store_true",
help="Print inferred JSON to stdout.",
)
parser.add_argument(
"--print-clean-text",
action="store_true",
help="Print cleaned text to stdout.",
)
return parser
def main() -> None:
args = _build_parser().parse_args()
print("main get clean text")
get_clean_text = _load_function_from_file(args.text_cleaner_path, "get_clean_text")
print("main got clean text")
cleaned_text = get_clean_text(args.text)
print("main got args.text")
if not isinstance(cleaned_text, str):
raise TypeError("get_clean_text(...) must return a string")
print("main get inferred")
inferred_json = run_infer_json_cli(
infer_script_path=args.infer_script_path,
template_path=args.template,
cleaned_text=cleaned_text,
python_executable=args.python_executable,
)
print("main got inferred")
print("main get generator")
generator = CheckpointCardGenerator(
checkpoint_path=args.checkpoint,
device=args.device,
generator_module_path=args.generator_module,
)
print("main got generator and will generate card")
generator.generate_card_from_metadata(
inferred_json,
num_inference_steps=args.num_inference_steps,
guidance_scale=args.guidance_scale,
save_path=args.save_path,
)
print("main card generated")
if args.print_clean_text:
print(cleaned_text)
if args.print_json:
print(json.dumps(inferred_json, indent=2))
print(f"Card generated and saved to: {args.save_path}")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,298 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 🎴 Génération de Carte Pokémon depuis un Texte Descriptif\n",
"## Partie 1 — Nettoyage du Texte (NLU Pipeline)\n",
"\n",
"On prend un texte descriptif fourni par l'utilisateur et on le nettoie étape par étape.\n",
"\n",
"```\n",
"Texte brut → Noise Removal → Tokenization → Stopwords → Lemmatization → Texte propre\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"## 📦 Installation des dépendances"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"ename": "",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31mRunning cells with 'Python 3.12.3' requires the ipykernel package.\n",
"\u001b[1;31m<a href='command:jupyter.createPythonEnvAndSelectController'>Create a Python Environment</a> with the required packages.\n",
"\u001b[1;31mOr install 'ipykernel' using the command: '/usr/bin/python3 -m pip install ipykernel -U --user --force-reinstall'"
]
}
],
"source": [
"!pip install nltk --quiet\n",
"\n",
"import nltk\n",
"nltk.download('punkt', quiet=True)\n",
"nltk.download('punkt_tab', quiet=True)\n",
"nltk.download('stopwords', quiet=True)\n",
"nltk.download('wordnet', quiet=True)\n",
"nltk.download('averaged_perceptron_tagger', quiet=True)\n",
"nltk.download('averaged_perceptron_tagger_eng', quiet=True)\n",
"\n",
"print(\"✅ Dépendances installées !\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"## 📝 Saisie du texte utilisateur"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"raw_text = \"\"\"\n",
"This is a HUGE fire dragon!!! It has got massive red wings and shoots \n",
"powerfull flames from its mouth... It's super fast n really strong!!\n",
"Its body is coverd with shiny golden scales & it lives in volcanos.\n",
"it luv to fight other pokémons and is very very aggressive >:(\n",
"\"\"\"\n",
"\n",
"print(\"📄 Texte brut :\")\n",
"print(raw_text)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"## 🧹 Étape 1 — Noise Removal\n",
"\n",
"On supprime la ponctuation, les caractères spéciaux, les mots trop courts, et on met tout en minuscules.\n",
"\n",
"> 📖 *Cours page 25-29 — `removePunctuation`, `removeShortWords`, `removePattern`*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"import string\n",
"\n",
"def remove_punctuation(text):\n",
" \"\"\"Supprime la ponctuation du texte.\"\"\"\n",
" mapping_table = text.maketrans('', '', string.punctuation)\n",
" return text.translate(mapping_table)\n",
"\n",
"def remove_special_chars(text):\n",
" \"\"\"Supprime les caractères non-ASCII (emojis, accents parasites...).\"\"\"\n",
" text = text.encode('ascii', 'ignore').decode('ascii')\n",
" text = re.sub(r'[^a-zA-Z\\s]', ' ', text)\n",
" return re.sub(r'\\s+', ' ', text).strip()\n",
"\n",
"def remove_short_words(text, min_len=3):\n",
" \"\"\"Supprime les mots de moins de min_len caractères.\"\"\"\n",
" return \" \".join([word for word in text.split() if len(word) >= min_len])\n",
"\n",
"# Application\n",
"text = raw_text.lower() # minuscules\n",
"text = remove_punctuation(text) # ponctuation\n",
"text = remove_special_chars(text) # caractères spéciaux\n",
"text = remove_short_words(text) # mots trop courts\n",
"\n",
"print(\"🔇 Après Noise Removal :\")\n",
"print(text)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"## 📖 Étape 2 — Object Standardization\n",
"\n",
"On remplace les abréviations et l'argot par leurs formes standard.\n",
"\n",
"> 📖 *Cours page 38 — lookup table `standardize`*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"SLANG_LOOKUP = {\n",
" \"n\": \"and\",\n",
" \"luv\": \"love\",\n",
" \"r\": \"are\",\n",
" \"u\": \"you\",\n",
" \"ur\": \"your\",\n",
" \"gonna\": \"going to\",\n",
" \"wanna\": \"want to\",\n",
" \"gotta\": \"got to\",\n",
" \"pokemons\": \"pokemon\",\n",
" \"pokmons\": \"pokemon\",\n",
"}\n",
"\n",
"def standardize(text, lookup=SLANG_LOOKUP):\n",
" \"\"\"Remplace les mots d'argot par leur forme standard.\"\"\"\n",
" words = text.split()\n",
" return \" \".join([lookup.get(word, word) for word in words])\n",
"\n",
"text = standardize(text)\n",
"\n",
"print(\"📖 Après Standardisation :\")\n",
"print(text)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"## ✂️ Étape 3 — Tokenization\n",
"\n",
"On découpe le texte en tokens individuels.\n",
"\n",
"> 📖 *Cours page 31 — `word_tokenize` (NLTK)*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from nltk import word_tokenize\n",
"\n",
"tokens = word_tokenize(text)\n",
"\n",
"print(f\"✂️ {len(tokens)} tokens :\")\n",
"print(tokens)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"## 🚫 Étape 4 — Suppression des Stopwords\n",
"\n",
"On retire les mots grammaticaux qui n'apportent pas de sens (\"the\", \"is\", \"a\"...).\n",
"\n",
"> 📖 *Cours page 27 — `cleanTextGT` avec `stopwords` (NLTK)*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from nltk.corpus import stopwords\n",
"\n",
"stop_words = set(stopwords.words('english'))\n",
"\n",
"tokens = [token for token in tokens if token not in stop_words]\n",
"\n",
"print(\"🚫 Tokens après suppression des stopwords :\")\n",
"print(tokens)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"## 🌿 Étape 5 — Lemmatization\n",
"\n",
"On réduit chaque mot à sa forme racine (`flames → flame`, `shooting → shoot`). On utilise le POS tag pour plus de précision.\n",
"\n",
"> 📖 *Cours page 36-37 — `WordNetLemmatizer` + POS tag*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from nltk.stem.wordnet import WordNetLemmatizer\n",
"from nltk import pos_tag\n",
"from nltk.corpus import wordnet\n",
"\n",
"lem = WordNetLemmatizer()\n",
"\n",
"def get_wordnet_pos(treebank_tag):\n",
" \"\"\"Convertit les tags Penn Treebank en tags WordNet.\"\"\"\n",
" if treebank_tag.startswith('J'): return wordnet.ADJ\n",
" elif treebank_tag.startswith('V'): return wordnet.VERB\n",
" elif treebank_tag.startswith('N'): return wordnet.NOUN\n",
" elif treebank_tag.startswith('R'): return wordnet.ADV\n",
" else: return wordnet.NOUN\n",
"\n",
"pos_tags = pos_tag(tokens)\n",
"tokens = [lem.lemmatize(token, get_wordnet_pos(tag)) for token, tag in pos_tags]\n",
"\n",
"print(\"🌿 Tokens après Lemmatization :\")\n",
"print(tokens)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"## ✅ Résultat final — Texte nettoyé"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"clean_text = \" \".join(tokens)\n",
"\n",
"print(\"📄 Texte brut :\")\n",
"print(raw_text.strip())\n",
"print()\n",
"print(\"✅ Texte nettoyé :\")\n",
"print(clean_text)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

View File

@@ -0,0 +1,158 @@
"""Reusable text-cleaning pipeline for Pokemon descriptions.
This module mirrors the notebook cleaning steps and exposes a Streamlit-friendly API:
- no input() calls
- no print side effects
- deterministic output for a given input
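
Typical usage (illustrative; the exact output depends on the installed NLTK
tagger and lemmatizer data):

    from text_cleaning_pipeline import get_clean_text

    cleaned = get_clean_text("This is a HUGE fire dragon!!! It shoots flames")
    # -> something like "huge fire dragon shoot flame"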
"""
from __future__ import annotations
import re
import string
from typing import Any, Dict, List
SLANG_LOOKUP: Dict[str, str] = {
"n": "and",
"luv": "love",
"r": "are",
"u": "you",
"ur": "your",
"gonna": "going to",
"wanna": "want to",
"gotta": "got to",
"pokemons": "pokemon",
"pokmons": "pokemon",
"bcz": "because",
}
_NLTK_RESOURCES = [
"punkt",
"punkt_tab",
"stopwords",
"wordnet",
"averaged_perceptron_tagger",
"averaged_perceptron_tagger_eng",
]
def _import_nltk() -> Any:
"""Import NLTK lazily so this module can be imported before deps are installed."""
try:
import nltk # type: ignore
except ModuleNotFoundError as exc:
raise RuntimeError(
"NLTK is not installed. Install project dependencies with: pip install -r requirements.txt"
) from exc
return nltk
def ensure_nltk_resources(quiet: bool = True) -> None:
"""Download required NLTK resources if missing.
Safe to call at app startup (including inside Streamlit).
"""
nltk = _import_nltk()
for resource in _NLTK_RESOURCES:
try:
nltk.download(resource, quiet=quiet)
except Exception as exc:
raise RuntimeError(f"Failed to download NLTK resource: {resource}") from exc
def remove_punctuation(text: str) -> str:
mapping_table = text.maketrans("", "", string.punctuation)
return text.translate(mapping_table)
def remove_special_chars(text: str) -> str:
text = text.encode("ascii", "ignore").decode("ascii")
text = re.sub(r"[^a-zA-Z\s]", " ", text)
return re.sub(r"\s+", " ", text).strip()
def remove_short_words(text: str, min_len: int = 3) -> str:
return " ".join(word for word in text.split() if len(word) >= min_len)
def remove_alphanum_words(text: str) -> str:
words = text.split()
cleaned = [
word
for word in words
if not (re.search(r"[a-zA-Z]", word) and re.search(r"[0-9]", word))
]
return " ".join(cleaned)
def standardize(text: str, lookup: Dict[str, str] | None = None) -> str:
mapping = lookup or SLANG_LOOKUP
return " ".join(mapping.get(word, word) for word in text.split())
def _get_wordnet_pos(treebank_tag: str) -> str:
nltk = _import_nltk()
wordnet = nltk.corpus.wordnet
if treebank_tag.startswith("J"):
return wordnet.ADJ
if treebank_tag.startswith("V"):
return wordnet.VERB
if treebank_tag.startswith("N"):
return wordnet.NOUN
if treebank_tag.startswith("R"):
return wordnet.ADV
return wordnet.NOUN
def clean_pokemon_text(raw_text: str, min_len: int = 3) -> Dict[str, Any]:
"""Run the full cleaning pipeline and return intermediate + final outputs.
Returns a dictionary so a UI can display each stage if desired.
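
    For example (illustrative):

        stages = clean_pokemon_text("A HUGE fire dragon!!! It shoots flames")
        stages["noise_removed"]   # text after noise removal
        stages["clean_text"]      # final lemmatized text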
"""
if not isinstance(raw_text, str):
raise TypeError("raw_text must be a string")
nltk = _import_nltk()
pos_tag = nltk.pos_tag
word_tokenize = nltk.word_tokenize
stopwords = nltk.corpus.stopwords
WordNetLemmatizer = nltk.stem.wordnet.WordNetLemmatizer
ensure_nltk_resources(quiet=True)
text = raw_text.lower()
text = remove_punctuation(text)
text = remove_alphanum_words(text)
text = remove_special_chars(text)
noise_removed = remove_short_words(text, min_len=min_len)
standardized = standardize(noise_removed)
tokens = word_tokenize(standardized)
stop_words = set(stopwords.words("english"))
tokens_no_stopwords = [token for token in tokens if token not in stop_words]
lem = WordNetLemmatizer()
pos_tags = pos_tag(tokens_no_stopwords)
lemmas = [
lem.lemmatize(token, _get_wordnet_pos(tag))
for token, tag in pos_tags
]
clean_text = " ".join(lemmas)
return {
"raw_text": raw_text,
"noise_removed": noise_removed,
"standardized": standardized,
"tokens": tokens,
"tokens_no_stopwords": tokens_no_stopwords,
"lemmas": lemmas,
"clean_text": clean_text,
}
def get_clean_text(raw_text: str, min_len: int = 3) -> str:
"""Small helper for app code that only needs the final cleaned text."""
return clean_pokemon_text(raw_text, min_len=min_len)["clean_text"]

View File

@@ -0,0 +1,451 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Partie 1 — Nettoyage du Texte\n",
"source .venv/bin/activate\n",
"cd \n",
"python nom du fichier\n",
"On prend un texte descriptif fourni par l'utilisateur et on le nettoie étape par étape.\n",
"\n",
"```\n",
"Texte brut → Noise Removal → Tokenization → Stopwords → Lemmatization → Texte propre\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"## Installation des dépendances"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dépendances installées !\n"
]
}
],
"source": [
"!pip install nltk --quiet\n",
"\n",
"import nltk\n",
"nltk.download('punkt', quiet=True)\n",
"nltk.download('punkt_tab', quiet=True)\n",
"nltk.download('stopwords', quiet=True)\n",
"nltk.download('wordnet', quiet=True)\n",
"nltk.download('averaged_perceptron_tagger', quiet=True)\n",
"nltk.download('averaged_perceptron_tagger_eng', quiet=True)\n",
"\n",
"print(\"Dépendances installées !\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"## Saisie du texte utilisateur"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [
{
"ename": "KeyboardInterrupt",
"evalue": "Interrupted by user",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mKeyboardInterrupt\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[86]\u001b[39m\u001b[32m, line 65\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m# test_texts = [\u001b[39;00m\n\u001b[32m 2\u001b[39m \n\u001b[32m 3\u001b[39m \u001b[38;5;66;03m# # 0 — Dragon de feu (texte original)\u001b[39;00m\n\u001b[32m (...)\u001b[39m\u001b[32m 62\u001b[39m \n\u001b[32m 63\u001b[39m \u001b[38;5;66;03m# print(f\" Texte de test n°{INDEX} :\")\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m65\u001b[39m raw_text = \u001b[38;5;28;43minput\u001b[39;49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mDécrivez votre Pokémon : \u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 67\u001b[39m \u001b[38;5;28mprint\u001b[39m(raw_text)\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/lib/python3.12/site-packages/ipykernel/kernelbase.py:1403\u001b[39m, in \u001b[36mKernel.raw_input\u001b[39m\u001b[34m(self, prompt)\u001b[39m\n\u001b[32m 1401\u001b[39m msg = \u001b[33m\"\u001b[39m\u001b[33mraw_input was called, but this frontend does not support input requests.\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 1402\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m StdinNotImplementedError(msg)\n\u001b[32m-> \u001b[39m\u001b[32m1403\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_input_request\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 1404\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mstr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mprompt\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1405\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_get_shell_context_var\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_shell_parent_ident\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1406\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mget_parent\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mshell\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1407\u001b[39m \u001b[43m \u001b[49m\u001b[43mpassword\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 1408\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/lib/python3.12/site-packages/ipykernel/kernelbase.py:1448\u001b[39m, in \u001b[36mKernel._input_request\u001b[39m\u001b[34m(self, prompt, ident, parent, password)\u001b[39m\n\u001b[32m 1445\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyboardInterrupt\u001b[39;00m:\n\u001b[32m 1446\u001b[39m \u001b[38;5;66;03m# re-raise KeyboardInterrupt, to truncate traceback\u001b[39;00m\n\u001b[32m 1447\u001b[39m msg = \u001b[33m\"\u001b[39m\u001b[33mInterrupted by user\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m-> \u001b[39m\u001b[32m1448\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyboardInterrupt\u001b[39;00m(msg) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 1449\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m:\n\u001b[32m 1450\u001b[39m \u001b[38;5;28mself\u001b[39m.log.warning(\u001b[33m\"\u001b[39m\u001b[33mInvalid Message:\u001b[39m\u001b[33m\"\u001b[39m, exc_info=\u001b[38;5;28;01mTrue\u001b[39;00m)\n",
"\u001b[31mKeyboardInterrupt\u001b[39m: Interrupted by user"
]
}
],
"source": [
"# test_texts = [\n",
"\n",
"# # 0 — Dragon de feu (texte original)\n",
"# \"\"\"\n",
"# This is a HUGE fire dragon!!! It has got massive red wings and shoots\n",
"# powerfull flames from its mouth... It's super fast n really strong!!\n",
"# Its body is coverd with shiny golden scales & it lives in volcanos.\n",
"# it luv to fight other pokémons and is very very aggressive >:(\n",
"# I want to call it Pyrokar.\n",
"# \"\"\",\n",
"\n",
"# # 1 — Pokémon aquatique calme\n",
"# \"\"\"\n",
"# My pokemon is called Aqualis!! its a small blue sea creature w/ big\n",
"# shiny eyes... very calm n gentle :) it swims super fast in deep oceans\n",
"# and can breath underwater 4ever. it glows in the dark like a lanternfish\n",
"# and heals other pokemons with its tears!!! luv this lil guy so much omg\n",
"# \"\"\",\n",
"\n",
"# # 2 — Pokémon électrique agressif\n",
"# \"\"\"\n",
"# ZAPTHORN is da name!! its an electric wolf w/ yellow n black fur and\n",
"# giant thunder claws !!! it runz at lightning speed thru storms & shoots\n",
"# bolts from its tail... super scary n powerfull enemy 4 sure >:D\n",
"# nobody can catch it bcz it disappears in the clouds when threatened\n",
"# \"\"\",\n",
"\n",
"# # 3 — Pokémon plante timide\n",
"# \"\"\"\n",
"# i wanna name it Sylverion... its a shy deer-like pokemon covered in\n",
"# beautiful flowers n vines. it lives deep in enchanted forests & only\n",
"# comes out at nite. its antlers r made of ancient wood n bloom every\n",
"# spring!! it can make plants grow super fast around it... so magical omg\n",
"# \"\"\",\n",
"\n",
"# # 4 — Pokémon glace / fantôme\n",
"# \"\"\"\n",
"# This haunted ice spirit is called Glacyra!!! it floats thru frozen\n",
"# mountains leavin icy footprints everywhere... its body is trasnparent\n",
"# like glass n u can see its frozen heart inside >< it whispers 2 trainers\n",
"# in their sleep n freezes everything it touchez. very misunderstood tbh\n",
"# \"\"\",\n",
"\n",
"# # 5 — Pokémon combat en franglais\n",
"# \"\"\"\n",
"# My Pokémon is called Ferroknux!! It's a big metal gorilla with\n",
"# gigantic iron fists and super thick armor on its chest... it smashes\n",
"# rocks with bare hands and trains all day, every day in the mountains!!\n",
"# Very strong and very aggressive, but loyal to its trainer 4ever :)\n",
"# \"\"\",\n",
"# #6 \n",
"# \"\"\"\n",
"# Furret is a long, slender, and agile creature with soft fur and a flexible body that allows it to move gracefully through narrow tunnels and hidden pathways. This normal-type Pokémon builds intricate nests perfectly shaped to fit its elongated form, making them nearly impossible for other creatures to enter. Despite its gentle and calm nature, Furret can become surprisingly energetic in battle, using its powerful tail to smash opponents with swift and playful attacks. It is often seen wandering across fields and forests, curiously observing its surroundings, and it shares a close bond with its pre-evolution, Sentret. Known for its endurance and cheerful spirit, Furret can quickly recover its energy, always feeling fine and ready to continue exploring or fighting alongside its trainer.\n",
"# \"\"\",\n",
"\n",
"# ]\n",
"\n",
"# # 👇 Changez cet index pour tester un autre texte\n",
"# INDEX = 6\n",
"\n",
"# raw_text = test_texts[INDEX]\n",
"\n",
"# print(f\" Texte de test n°{INDEX} :\")\n",
"\n",
"raw_text = input(\"Décrivez votre Pokémon : \")\n",
"\n",
"print(raw_text)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"## Étape 1 — Noise Removal\n",
"\n",
"On supprime la ponctuation, les caractères spéciaux, les mots trop courts, et on met tout en minuscules.\n",
"\n",
"> *Cours page 25-29 — `removePunctuation`, `removeShortWords`, `removePattern`*"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Après Noise Removal :\n",
"furret long slender and agile creature with soft fur and flexible body that allows move gracefully through narrow tunnels and hidden pathways this normaltype pokmon builds intricate nests perfectly shaped fit its elongated form making them nearly impossible for other creatures enter despite its gentle and calm nature furret can become surprisingly energetic battle using its powerful tail smash opponents with swift and playful attacks often seen wandering across fields and forests curiously observing its surroundings and shares close bond with its preevolution sentret known for its endurance and cheerful spirit furret can quickly recover its energy always feeling fine and ready continue exploring fighting alongside its trainer\n"
]
}
],
"source": [
"import re\n",
"import string\n",
"\n",
"def remove_punctuation(text):\n",
" \"\"\"Supprime la ponctuation du texte.\"\"\"\n",
" mapping_table = text.maketrans('', '', string.punctuation)\n",
" return text.translate(mapping_table)\n",
"\n",
"def remove_special_chars(text):\n",
" \"\"\"Supprime les caractères non-ASCII (emojis, accents parasites...).\"\"\"\n",
" text = text.encode('ascii', 'ignore').decode('ascii')\n",
" text = re.sub(r'[^a-zA-Z\\s]', ' ', text)\n",
" return re.sub(r'\\s+', ' ', text).strip()\n",
"\n",
"def remove_short_words(text, min_len=3):\n",
" \"\"\"Supprime les mots de moins de min_len caractères.\"\"\"\n",
" return \" \".join([word for word in text.split() if len(word) >= min_len])\n",
"\n",
"\n",
"def remove_alphanum_words(text):\n",
" \"\"\"Supprime les mots qui contiennent à la fois des lettres et des chiffres\n",
" (ex: '4ever', 'n1', '2night', 'runz4', 'mp3').\"\"\"\n",
" words = text.split()\n",
" cleaned = [word for word in words\n",
" if not (re.search(r'[a-zA-Z]', word) and re.search(r'[0-9]', word))]\n",
" return \" \".join(cleaned)\n",
"\n",
"# Application\n",
"text = raw_text.lower() # minuscules\n",
"text = remove_punctuation(text) # ponctuation\n",
"text = remove_alphanum_words(text) \n",
"text = remove_special_chars(text) # caractères spéciaux\n",
"text = remove_short_words(text) # mots trop courts\n",
"\n",
"print(\" Après Noise Removal :\")\n",
"print(text)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"## Étape 2 — Object Standardization\n",
"\n",
"On remplace les abréviations et l'argot par leurs formes standard.\n",
"\n",
"> *Cours page 38 — lookup table `standardize`*"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Après Standardisation :\n",
"furret long slender and agile creature with soft fur and flexible body that allows move gracefully through narrow tunnels and hidden pathways this normaltype pokmon builds intricate nests perfectly shaped fit its elongated form making them nearly impossible for other creatures enter despite its gentle and calm nature furret can become surprisingly energetic battle using its powerful tail smash opponents with swift and playful attacks often seen wandering across fields and forests curiously observing its surroundings and shares close bond with its preevolution sentret known for its endurance and cheerful spirit furret can quickly recover its energy always feeling fine and ready continue exploring fighting alongside its trainer\n"
]
}
],
"source": [
"SLANG_LOOKUP = {\n",
" \"n\": \"and\",\n",
" \"luv\": \"love\",\n",
" \"r\": \"are\",\n",
" \"u\": \"you\",\n",
" \"ur\": \"your\",\n",
" \"gonna\": \"going to\",\n",
" \"wanna\": \"want to\",\n",
" \"gotta\": \"got to\",\n",
" \"pokemons\": \"pokemon\",\n",
" \"pokmons\": \"pokemon\",\n",
" \"bcz\": \"because\",\n",
"}\n",
"\n",
"def standardize(text, lookup=SLANG_LOOKUP):\n",
" \"\"\"Remplace les mots d'argot par leur forme standard.\"\"\"\n",
" words = text.split()\n",
" return \" \".join([lookup.get(word, word) for word in words])\n",
"\n",
"text = standardize(text)\n",
"\n",
"print(\" Après Standardisation :\")\n",
"print(text)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"## Étape 3 — Tokenization\n",
"\n",
"On découpe le texte en tokens individuels.\n",
"\n",
"> *Cours page 31 — `word_tokenize` (NLTK)*"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 108 tokens :\n",
"['furret', 'long', 'slender', 'and', 'agile', 'creature', 'with', 'soft', 'fur', 'and', 'flexible', 'body', 'that', 'allows', 'move', 'gracefully', 'through', 'narrow', 'tunnels', 'and', 'hidden', 'pathways', 'this', 'normaltype', 'pokmon', 'builds', 'intricate', 'nests', 'perfectly', 'shaped', 'fit', 'its', 'elongated', 'form', 'making', 'them', 'nearly', 'impossible', 'for', 'other', 'creatures', 'enter', 'despite', 'its', 'gentle', 'and', 'calm', 'nature', 'furret', 'can', 'become', 'surprisingly', 'energetic', 'battle', 'using', 'its', 'powerful', 'tail', 'smash', 'opponents', 'with', 'swift', 'and', 'playful', 'attacks', 'often', 'seen', 'wandering', 'across', 'fields', 'and', 'forests', 'curiously', 'observing', 'its', 'surroundings', 'and', 'shares', 'close', 'bond', 'with', 'its', 'preevolution', 'sentret', 'known', 'for', 'its', 'endurance', 'and', 'cheerful', 'spirit', 'furret', 'can', 'quickly', 'recover', 'its', 'energy', 'always', 'feeling', 'fine', 'and', 'ready', 'continue', 'exploring', 'fighting', 'alongside', 'its', 'trainer']\n"
]
}
],
"source": [
"from nltk import word_tokenize\n",
"\n",
"tokens = word_tokenize(text)\n",
"\n",
"print(f\" {len(tokens)} tokens :\")\n",
"print(tokens)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"## Étape 4 — Suppression des Stopwords\n",
"\n",
"On retire les mots grammaticaux qui n'apportent pas de sens (\"the\", \"is\", \"a\"...).\n",
"\n",
"> *Cours page 27 — `cleanTextGT` avec `stopwords` (NLTK)*"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Tokens après suppression des stopwords :\n",
"['furret', 'long', 'slender', 'agile', 'creature', 'soft', 'fur', 'flexible', 'body', 'allows', 'move', 'gracefully', 'narrow', 'tunnels', 'hidden', 'pathways', 'normaltype', 'pokmon', 'builds', 'intricate', 'nests', 'perfectly', 'shaped', 'fit', 'elongated', 'form', 'making', 'nearly', 'impossible', 'creatures', 'enter', 'despite', 'gentle', 'calm', 'nature', 'furret', 'become', 'surprisingly', 'energetic', 'battle', 'using', 'powerful', 'tail', 'smash', 'opponents', 'swift', 'playful', 'attacks', 'often', 'seen', 'wandering', 'across', 'fields', 'forests', 'curiously', 'observing', 'surroundings', 'shares', 'close', 'bond', 'preevolution', 'sentret', 'known', 'endurance', 'cheerful', 'spirit', 'furret', 'quickly', 'recover', 'energy', 'always', 'feeling', 'fine', 'ready', 'continue', 'exploring', 'fighting', 'alongside', 'trainer']\n"
]
}
],
"source": [
"from nltk.corpus import stopwords\n",
"\n",
"stop_words = set(stopwords.words('english'))\n",
"\n",
"tokens = [token for token in tokens if token not in stop_words]\n",
"\n",
"print(\"Tokens après suppression des stopwords :\")\n",
"print(tokens)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"## Étape 5 — Lemmatization\n",
"\n",
"On réduit chaque mot à sa forme racine (`flames → flame`, `shooting → shoot`). On utilise le POS tag pour plus de précision.\n",
"\n",
"> *Cours page 36-37 — `WordNetLemmatizer` + POS tag*"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Tokens après Lemmatization :\n",
"['furret', 'long', 'slender', 'agile', 'creature', 'soft', 'fur', 'flexible', 'body', 'allow', 'move', 'gracefully', 'narrow', 'tunnel', 'hide', 'pathway', 'normaltype', 'pokmon', 'build', 'intricate', 'nest', 'perfectly', 'shape', 'fit', 'elongated', 'form', 'make', 'nearly', 'impossible', 'creature', 'enter', 'despite', 'gentle', 'calm', 'nature', 'furret', 'become', 'surprisingly', 'energetic', 'battle', 'use', 'powerful', 'tail', 'smash', 'opponent', 'swift', 'playful', 'attack', 'often', 'see', 'wander', 'across', 'field', 'forest', 'curiously', 'observe', 'surroundings', 'share', 'close', 'bond', 'preevolution', 'sentret', 'know', 'endurance', 'cheerful', 'spirit', 'furret', 'quickly', 'recover', 'energy', 'always', 'feel', 'fine', 'ready', 'continue', 'explore', 'fight', 'alongside', 'trainer']\n"
]
}
],
"source": [
"from nltk.stem.wordnet import WordNetLemmatizer\n",
"from nltk import pos_tag\n",
"from nltk.corpus import wordnet\n",
"\n",
"lem = WordNetLemmatizer()\n",
"\n",
"def get_wordnet_pos(treebank_tag):\n",
" \"\"\"Convertit les tags Penn Treebank en tags WordNet.\"\"\"\n",
" if treebank_tag.startswith('J'): return wordnet.ADJ\n",
" elif treebank_tag.startswith('V'): return wordnet.VERB\n",
" elif treebank_tag.startswith('N'): return wordnet.NOUN\n",
" elif treebank_tag.startswith('R'): return wordnet.ADV\n",
" else: return wordnet.NOUN\n",
"\n",
"pos_tags = pos_tag(tokens)\n",
"tokens = [lem.lemmatize(token, get_wordnet_pos(tag)) for token, tag in pos_tags]\n",
"\n",
"print(\" Tokens après Lemmatization :\")\n",
"print(tokens)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"## Résultat final — Texte nettoyé"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"📄 Texte brut :\n",
"Furret is a long, slender, and agile creature with soft fur and a flexible body that allows it to move gracefully through narrow tunnels and hidden pathways. This normal-type Pokémon builds intricate nests perfectly shaped to fit its elongated form, making them nearly impossible for other creatures to enter. Despite its gentle and calm nature, Furret can become surprisingly energetic in battle, using its powerful tail to smash opponents with swift and playful attacks. It is often seen wandering across fields and forests, curiously observing its surroundings, and it shares a close bond with its pre-evolution, Sentret. Known for its endurance and cheerful spirit, Furret can quickly recover its energy, always feeling fine and ready to continue exploring or fighting alongside its trainer.\n",
"\n",
"Texte nettoyé :\n",
"furret long slender agile creature soft fur flexible body allow move gracefully narrow tunnel hide pathway normaltype pokmon build intricate nest perfectly shape fit elongated form make nearly impossible creature enter despite gentle calm nature furret become surprisingly energetic battle use powerful tail smash opponent swift playful attack often see wander across field forest curiously observe surroundings share close bond preevolution sentret know endurance cheerful spirit furret quickly recover energy always feel fine ready continue explore fight alongside trainer\n"
]
}
],
"source": [
"clean_text = \" \".join(tokens)\n",
"\n",
"print(\"📄 Texte brut :\")\n",
"print(raw_text.strip())\n",
"print()\n",
"print(\"Texte nettoyé :\")\n",
"print(clean_text)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

View File

@@ -0,0 +1,158 @@
"""Reusable text-cleaning pipeline for Pokemon descriptions.
This module mirrors the notebook cleaning steps and exposes a Streamlit-friendly API:
- no input() calls
- no print side effects
- deterministic output for a given input
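
Typical usage (illustrative; the exact output depends on the installed NLTK
tagger and lemmatizer data):

    from text_cleaning_pipeline import get_clean_text

    cleaned = get_clean_text("This is a HUGE fire dragon!!! It shoots flames")
    # -> something like "huge fire dragon shoot flame"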
"""
from __future__ import annotations
import re
import string
from typing import Any, Dict, List
SLANG_LOOKUP: Dict[str, str] = {
"n": "and",
"luv": "love",
"r": "are",
"u": "you",
"ur": "your",
"gonna": "going to",
"wanna": "want to",
"gotta": "got to",
"pokemons": "pokemon",
"pokmons": "pokemon",
"bcz": "because",
}
_NLTK_RESOURCES = [
"punkt",
"punkt_tab",
"stopwords",
"wordnet",
"averaged_perceptron_tagger",
"averaged_perceptron_tagger_eng",
]
def _import_nltk() -> Any:
"""Import NLTK lazily so this module can be imported before deps are installed."""
try:
import nltk # type: ignore
except ModuleNotFoundError as exc:
raise RuntimeError(
"NLTK is not installed. Install project dependencies with: pip install -r requirements.txt"
) from exc
return nltk
def ensure_nltk_resources(quiet: bool = True) -> None:
"""Download required NLTK resources if missing.
Safe to call at app startup (including inside Streamlit).
"""
nltk = _import_nltk()
for resource in _NLTK_RESOURCES:
try:
nltk.download(resource, quiet=quiet)
except Exception as exc:
raise RuntimeError(f"Failed to download NLTK resource: {resource}") from exc
def remove_punctuation(text: str) -> str:
mapping_table = text.maketrans("", "", string.punctuation)
return text.translate(mapping_table)
def remove_special_chars(text: str) -> str:
text = text.encode("ascii", "ignore").decode("ascii")
text = re.sub(r"[^a-zA-Z\s]", " ", text)
return re.sub(r"\s+", " ", text).strip()
def remove_short_words(text: str, min_len: int = 3) -> str:
return " ".join(word for word in text.split() if len(word) >= min_len)
def remove_alphanum_words(text: str) -> str:
words = text.split()
cleaned = [
word
for word in words
if not (re.search(r"[a-zA-Z]", word) and re.search(r"[0-9]", word))
]
return " ".join(cleaned)
def standardize(text: str, lookup: Dict[str, str] | None = None) -> str:
mapping = lookup or SLANG_LOOKUP
return " ".join(mapping.get(word, word) for word in text.split())
def _get_wordnet_pos(treebank_tag: str) -> str:
nltk = _import_nltk()
wordnet = nltk.corpus.wordnet
if treebank_tag.startswith("J"):
return wordnet.ADJ
if treebank_tag.startswith("V"):
return wordnet.VERB
if treebank_tag.startswith("N"):
return wordnet.NOUN
if treebank_tag.startswith("R"):
return wordnet.ADV
return wordnet.NOUN
def clean_pokemon_text(raw_text: str, min_len: int = 3) -> Dict[str, Any]:
"""Run the full cleaning pipeline and return intermediate + final outputs.
Returns a dictionary so a UI can display each stage if desired.
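
    For example (illustrative):

        stages = clean_pokemon_text("A HUGE fire dragon!!! It shoots flames")
        stages["noise_removed"]   # text after noise removal
        stages["clean_text"]      # final lemmatized text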
"""
if not isinstance(raw_text, str):
raise TypeError("raw_text must be a string")
nltk = _import_nltk()
pos_tag = nltk.pos_tag
word_tokenize = nltk.word_tokenize
stopwords = nltk.corpus.stopwords
WordNetLemmatizer = nltk.stem.wordnet.WordNetLemmatizer
ensure_nltk_resources(quiet=True)
text = raw_text.lower()
text = remove_punctuation(text)
text = remove_alphanum_words(text)
text = remove_special_chars(text)
noise_removed = remove_short_words(text, min_len=min_len)
standardized = standardize(noise_removed)
tokens = word_tokenize(standardized)
stop_words = set(stopwords.words("english"))
tokens_no_stopwords = [token for token in tokens if token not in stop_words]
lem = WordNetLemmatizer()
pos_tags = pos_tag(tokens_no_stopwords)
lemmas = [
lem.lemmatize(token, _get_wordnet_pos(tag))
for token, tag in pos_tags
]
clean_text = " ".join(lemmas)
return {
"raw_text": raw_text,
"noise_removed": noise_removed,
"standardized": standardized,
"tokens": tokens,
"tokens_no_stopwords": tokens_no_stopwords,
"lemmas": lemmas,
"clean_text": clean_text,
}
def get_clean_text(raw_text: str, min_len: int = 3) -> str:
"""Small helper for app code that only needs the final cleaned text."""
return clean_pokemon_text(raw_text, min_len=min_len)["clean_text"]