first commit
This commit is contained in:
67
CLAUDE.md
Normal file
67
CLAUDE.md
Normal file
@@ -0,0 +1,67 @@
|
||||
# CLAUDE.md
|
||||
|
||||
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
||||
|
||||
## Project Overview
|
||||
|
||||
Juicepyter is a Pokémon card generator pipeline that takes a natural language description, cleans it, extracts structured JSON metadata, and generates a card image using a LoRA-finetuned Stable Diffusion model. A Streamlit UI (`app.py`) ties it all together.
|
||||
|
||||
## Architecture — Three-Stage Pipeline
|
||||
|
||||
The pipeline (`prompt_to_card_pipeline.py`) orchestrates three stages:
|
||||
|
||||
1. **Text cleaning** (`text-cleaner/text_cleaning_pipeline.py`): NLTK-based pipeline — lowercasing, punctuation/slang removal, stopword filtering, POS-aware lemmatization. Entry point: `get_clean_text(raw_text) -> str`.
|
||||
|
||||
2. **Keyword extraction + JSON inference** (`clean-text-to-keywords/`): spaCy + YAKE keyword extraction (`keyword_extractor.py`) → rule-based JSON inference (`json_inference.py`) that populates a TCG-style card template. CLI: `infer_json_usage.py`. No LLM calls — deterministic and rule-based.
|
||||
|
||||
3. **Card image generation** (`card_generator_adapter.py`): Loads `runwayml/stable-diffusion-v1-5` with a LoRA adapter (PEFT) from `pokemon_card_lora/`, converts metadata to a SD prompt via `metadata_to_conditioning()`, runs inference. The generator module is pluggable via `--generator-module`.
|
||||
|
||||
`fetch_card.py` is a standalone data collection script that downloads real Pokémon TCG card images with embedded metadata using the TCGdex SDK.
|
||||
|
||||
## Commands
|
||||
|
||||
### Run the Streamlit app
|
||||
```bash
|
||||
streamlit run app.py
|
||||
```
|
||||
|
||||
### Run the full pipeline CLI
|
||||
```bash
|
||||
python prompt_to_card_pipeline.py "description text" \
|
||||
--text-cleaner-path text-cleaner/text_cleaning_pipeline.py \
|
||||
--infer-script-path clean-text-to-keywords/infer_json_usage.py \
|
||||
--checkpoint pokemon_card_lora \
|
||||
--template clean-text-to-keywords/json_template_example.json \
|
||||
--generator-module card_generator_adapter.py \
|
||||
--device cpu \
|
||||
--save-path generated_card.png \
|
||||
--print-json
|
||||
```
|
||||
|
||||
### Run keyword extraction + JSON inference only
|
||||
```bash
|
||||
cd clean-text-to-keywords
|
||||
python infer_json_usage.py --template json_template_example.json "your pokemon description"
|
||||
```
|
||||
|
||||
### Tests
|
||||
```bash
|
||||
cd clean-text-to-keywords
|
||||
python -m unittest -q
|
||||
```
|
||||
|
||||
## Dependencies
|
||||
|
||||
- **text-cleaner**: `nltk` (punkt, stopwords, wordnet, averaged_perceptron_tagger)
|
||||
- **clean-text-to-keywords**: `spacy>=3.7.0`, `yake>=0.4.2`, spaCy model `en_core_web_sm`
|
||||
- **card generation**: `diffusers`, `torch`, `peft`, `transformers`, `accelerate`, `safetensors`
|
||||
- **app**: `streamlit`, `Pillow`
|
||||
- **fetch_card**: `tcgdexsdk`, `Pillow`
|
||||
|
||||
Python 3.13 or lower recommended (spaCy compatibility).
|
||||
|
||||
## Key Design Decisions
|
||||
|
||||
- The generator module pattern is pluggable: any module with `build_pipeline(checkpoint_path, device)` and optionally `metadata_to_conditioning(meta)` can be swapped in via `--generator-module`.
|
||||
- The JSON inference stage preserves non-empty fields in the provided template — only empty fields get populated.
|
||||
- The LoRA base model is `runwayml/stable-diffusion-v1-5` with PEFT adapter weights in `pokemon_card_lora/`.
|
||||
130
app.py
Normal file
130
app.py
Normal file
@@ -0,0 +1,130 @@
|
||||
import streamlit as st
|
||||
import subprocess
|
||||
import sys
|
||||
import shlex
|
||||
from pathlib import Path
|
||||
from PIL import Image
|
||||
|
||||
APP_DIR = Path(__file__).resolve().parent
|
||||
PIPELINE_SCRIPT = APP_DIR / "prompt_to_card_pipeline.py"
|
||||
TEXT_CLEANER_PATH = APP_DIR / "text-cleaner" / "text_cleaning_pipeline.py"
|
||||
INFER_SCRIPT_PATH = APP_DIR / "clean-text-to-keywords" / "infer_json_usage.py"
|
||||
CHECKPOINT_PATH = APP_DIR / "pokemon_card_lora" / "training_history.pt"
|
||||
TEMPLATE_PATH = APP_DIR / "clean-text-to-keywords" / "json_template_example.json"
|
||||
IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg", ".webp", ".bmp")
|
||||
|
||||
|
||||
def _extract_image_from_stdout(stdout: str) -> Path | None:
|
||||
for line in reversed(stdout.splitlines()):
|
||||
text = line.strip().strip("\"'")
|
||||
if not text:
|
||||
continue
|
||||
|
||||
candidate = Path(text)
|
||||
if not candidate.is_absolute():
|
||||
candidate = APP_DIR / candidate
|
||||
|
||||
if candidate.suffix.lower() in IMAGE_EXTENSIONS and candidate.exists():
|
||||
return candidate
|
||||
|
||||
return None
|
||||
|
||||
|
||||
|
||||
def run_prompt_pipeline(prompt_text: str) -> tuple[Path | None, str]:
    """Run the full card-generation pipeline as a subprocess.

    Returns ``(image_path, logs)``: the generated image detected in the
    subprocess stdout (None on failure) and the combined stdout/stderr
    text for display in the UI expander.
    """
    # Every flag and its value must be its OWN argv element: subprocess.run
    # with a list does no shell splitting, so packing all flags into one
    # string would hand the pipeline a single bogus positional argument.
    # sys.executable guarantees the same interpreter that runs Streamlit.
    cmd = [
        sys.executable,
        str(PIPELINE_SCRIPT),
        prompt_text,
        "--text-cleaner-path", str(TEXT_CLEANER_PATH),
        "--infer-script-path", str(INFER_SCRIPT_PATH),
        # The pipeline expects the LoRA directory, not the .pt history file
        # that CHECKPOINT_PATH points at.
        "--checkpoint", str(APP_DIR / "pokemon_card_lora"),
        "--template", str(TEMPLATE_PATH),
        "--generator-module", str(APP_DIR / "card_generator_adapter.py"),
        "--device", "cuda",
        "--save-path", "generated_card.png",
        "--print-json",
    ]

    result = subprocess.run(
        cmd,
        cwd=APP_DIR,
        capture_output=True,
        text=True,
        check=False,
    )

    full_output = (result.stdout or "") + ("\n" + result.stderr if result.stderr else "")

    if result.returncode != 0:
        # Surface whatever the subprocess printed so the UI can show it.
        return None, full_output.strip() or "Erreur inconnue pendant le pipeline."

    image_path = _extract_image_from_stdout(result.stdout or "")
    return image_path, full_output.strip()
|
||||
|
||||
# ------------------------------------------------------------------ #
|
||||
# Configuration #
|
||||
# ------------------------------------------------------------------ #
|
||||
|
||||
st.set_page_config(
|
||||
page_title="Générateur de Carte Pokémon",
|
||||
page_icon=Image.open(Path(__file__).with_name("pokeball.png")),
|
||||
layout="centered",
|
||||
)
|
||||
|
||||
logo_col, title_col = st.columns([1, 6], vertical_alignment="center")
|
||||
|
||||
with logo_col:
|
||||
st.image(Image.open(Path(__file__).with_name("pokeball.png")), width=72)
|
||||
|
||||
with title_col:
|
||||
st.title("Générateur de Carte Pokémon")
|
||||
st.markdown("Décrivez votre Pokémon en langage naturel et laissez la magie opérer !")
|
||||
|
||||
# ------------------------------------------------------------------ #
|
||||
# Saisie utilisateur #
|
||||
# ------------------------------------------------------------------ #
|
||||
|
||||
raw_text = st.text_area(
|
||||
label="Description de votre Pokémon",
|
||||
placeholder=(
|
||||
"Ex: My pokemon is called Pyrokar! Its a huge fire dragon with massive "
|
||||
"red wings and shoots flames from its mouth... super fast n aggressive >:("
|
||||
),
|
||||
height=180,
|
||||
)
|
||||
|
||||
st.markdown(
|
||||
"""
|
||||
<style>
|
||||
div.stButton > button {
|
||||
background-color: #d62828;
|
||||
color: white;
|
||||
border: 1px solid #b91c1c;
|
||||
}
|
||||
div.stButton > button:hover {
|
||||
background-color: #b91c1c;
|
||||
border-color: #991b1b;
|
||||
color: white;
|
||||
}
|
||||
</style>
|
||||
""",
|
||||
unsafe_allow_html=True,
|
||||
)
|
||||
|
||||
generate = st.button(" Générer la carte", use_container_width=True)
|
||||
|
||||
# ------------------------------------------------------------------ #
|
||||
# Pipeline #
|
||||
# ------------------------------------------------------------------ #
|
||||
|
||||
if generate:
|
||||
if not raw_text.strip():
|
||||
st.warning("Veuillez entrer une description avant de générer.")
|
||||
else:
|
||||
with st.spinner("Génération de la carte Pokémon..."):
|
||||
image, logs = run_prompt_pipeline(raw_text)
|
||||
|
||||
if image is not None:
|
||||
st.image(image, caption="Carte Pokémon générée", width="stretch")
|
||||
if logs:
|
||||
with st.expander("Logs pipeline"):
|
||||
st.code(logs)
|
||||
else:
|
||||
st.error("Aucune image générée détectée. Vérifiez les chemins du pipeline.")
|
||||
if logs:
|
||||
with st.expander("Logs pipeline"):
|
||||
st.code(logs)
|
||||
107
card_generator_adapter.py
Normal file
107
card_generator_adapter.py
Normal file
@@ -0,0 +1,107 @@
|
||||
"""Adapter to load the LoRA checkpoint and define conditioning logic.
|
||||
|
||||
Customize this file to match your model architecture, then use:
|
||||
--generator-module card_generator_adapter.py
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Mapping
|
||||
|
||||
|
||||
def build_pipeline(checkpoint_path: str, device: str):
    """Load LoRA adapter and return a callable pipeline.

    The pipeline must accept:
        pipeline(prompt_or_conditioning, num_inference_steps=30, guidance_scale=7.5)

    and return an object with .images attribute.

    ``checkpoint_path`` may be either the adapter directory itself or any
    file inside it (the parent directory is used in that case).
    Raises FileNotFoundError when the path or the adapter weights are
    missing, and RuntimeError when diffusers/torch or the PEFT backend
    are not installed.
    """
    from pathlib import Path

    # Accept either a directory or a file path; normalize to the directory
    # that should contain adapter_config.json + adapter_model.safetensors.
    checkpoint_input = Path(checkpoint_path).expanduser().resolve()
    if checkpoint_input.is_dir():
        checkpoint_dir = checkpoint_input
    elif checkpoint_input.exists():
        checkpoint_dir = checkpoint_input.parent
    else:
        raise FileNotFoundError(f"Checkpoint path not found: {checkpoint_input}")

    # Load base Stable Diffusion model + LoRA adapter (PEFT).
    # Imported lazily so this module can be imported without the heavy deps.
    try:
        from diffusers import StableDiffusionPipeline
        import torch
    except ImportError as e:
        raise RuntimeError(
            f"diffusers and torch required. Install: pip install diffusers torch "
            f"(error: {e})"
        )

    # Load base model. fp16 on CUDA halves memory; CPU needs fp32.
    # NOTE(review): device strings like "cuda:1" would not get fp16 here —
    # confirm callers only ever pass "cuda" or "cpu".
    model_id = "runwayml/stable-diffusion-v1-5"
    pipe = StableDiffusionPipeline.from_pretrained(
        model_id,
        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    )
    pipe = pipe.to(device)

    # Load LoRA weights from adapter_model.safetensors
    adapter_path = checkpoint_dir / "adapter_model.safetensors"
    if adapter_path.exists():
        try:
            # load_lora_weights takes the directory; diffusers finds the
            # adapter files by their conventional names.
            pipe.load_lora_weights(str(checkpoint_dir))
        except Exception as e:
            message = str(e)
            # Special-case the common "peft not installed" failure with an
            # actionable install hint.
            if "PEFT backend is required" in message:
                raise RuntimeError(
                    "Failed to load LoRA: PEFT backend is missing. "
                    "Install required packages with: pip install peft transformers accelerate safetensors"
                ) from e
            raise RuntimeError(
                f"Failed to load LoRA from {checkpoint_dir}: {e}\n"
                "Ensure adapter_config.json and adapter_model.safetensors are present."
            ) from e
    else:
        raise FileNotFoundError(
            f"LoRA adapter not found at {adapter_path}. "
            f"Expected: adapter_model.safetensors in {checkpoint_dir}"
        )

    return pipe
|
||||
|
||||
|
||||
def metadata_to_conditioning(meta: Mapping[str, Any]) -> str:
    """Convert a card-metadata mapping into a Stable Diffusion prompt.

    Missing fields fall back to defaults ("Unknown Pokemon", "normal"
    type, HP 60); at most two attack names are included. The LoRA is
    trained on Pokemon cards, so the prompt describes one.
    """
    name = str(meta.get("name", "Unknown Pokemon"))
    pokemon_type = str(meta.get("type", "normal")).capitalize()
    secondary = meta.get("secondary_type")

    hp = str(meta.get("hp", "60"))

    attacks = meta.get("attacks") or []
    attack_list = []
    if isinstance(attacks, list):
        for atk in attacks:
            if isinstance(atk, dict):
                attack_list.append(str(atk.get("name", "")).lower())
            elif atk:
                attack_list.append(str(atk).lower())

    # Build a descriptive prompt for card generation
    prompt = f"Pokemon trading card of {name}, {pokemon_type}-type Pokemon"
    if secondary:
        # str() guards against non-string secondary type values.
        prompt += f"/{str(secondary).capitalize()}"
    prompt += f", HP {hp}"

    if attack_list:
        prompt += f", with attacks: {', '.join(attack_list[:2])}"

    # BUG FIX: meta.get("description", "").strip() raised AttributeError
    # when "description" was present but None; coerce falsy values to "".
    description = str(meta.get("description") or "").strip()
    if description:
        prompt += f". {description}"

    prompt += ". High quality illustration, official Pokemon card style."
    return prompt
|
||||
189
clean-text-to-keywords/.ipynb_checkpoints/README-checkpoint.md
Normal file
189
clean-text-to-keywords/.ipynb_checkpoints/README-checkpoint.md
Normal file
@@ -0,0 +1,189 @@
|
||||
# Pokemon Text-to-JSON Pipeline
|
||||
|
||||
This project converts free-form Pokemon description text into:
|
||||
|
||||
1. A normalized keyword list
|
||||
2. A populated Pokemon JSON object (from a blank/key-only template)
|
||||
|
||||
The pipeline is deterministic and rule-based.
|
||||
|
||||
## Architecture
|
||||
|
||||
### Stage 1: Keyword Extraction
|
||||
|
||||
File: `keyword_extractor.py`
|
||||
|
||||
Input: raw text description
|
||||
|
||||
Core logic:
|
||||
|
||||
- spaCy tokenization and POS tagging
|
||||
- POS filtering (`NOUN`, `ADJ`, `VERB`)
|
||||
- stopword and punctuation removal
|
||||
- lemma-based normalization
|
||||
- domain synonym normalization (example: `flames -> fire`)
|
||||
- optional YAKE relevance scoring
|
||||
- conservative retention policy so detail is not over-pruned
|
||||
|
||||
Output: ordered list of normalized keywords
|
||||
|
||||
### Stage 2: JSON Inference
|
||||
|
||||
File: `json_inference.py`
|
||||
|
||||
Input: keyword list + optional JSON template
|
||||
|
||||
Core logic:
|
||||
|
||||
- infer primary/secondary type
|
||||
- infer name candidate
|
||||
- infer attacks, abilities, habitat, personality
|
||||
- infer basic stats (`hp`, `attack`, `defense`, `speed`)
|
||||
- fill nested TCG-like template fields (`types`, `attacks`, `weaknesses`, `stage`, `retreat`, etc.)
|
||||
- preserve already non-empty values in the provided template
|
||||
|
||||
Output: inferred JSON profile
|
||||
|
||||
### Stage 3: Orchestration CLI
|
||||
|
||||
File: `infer_json_usage.py`
|
||||
|
||||
This is the main entrypoint for end-to-end usage.
|
||||
|
||||
Default behavior:
|
||||
|
||||
1. prints extracted keyword list
|
||||
2. prints inferred JSON
|
||||
|
||||
## Project Structure
|
||||
|
||||
- `keyword_extractor.py`: keyword extraction engine
|
||||
- `json_inference.py`: keyword-to-JSON inference logic
|
||||
- `infer_json_usage.py`: end-to-end CLI
|
||||
- `example_usage.py`: keyword extraction only CLI
|
||||
- `json_template_example.json`: sample blank/key-only template
|
||||
- `test_keyword_extractor.py`: extraction tests
|
||||
- `test_json_inference.py`: inference tests
|
||||
- `requirements.txt`: Python dependencies
|
||||
|
||||
## Requirements
|
||||
|
||||
- Python 3.13 or lower is recommended for spaCy compatibility
|
||||
- pip
|
||||
|
||||
Dependencies in `requirements.txt`:
|
||||
|
||||
- `spacy>=3.7.0`
|
||||
- `yake>=0.4.2`
|
||||
|
||||
## Setup
|
||||
|
||||
1. Create and activate a virtual environment (recommended)
|
||||
|
||||
```bash
|
||||
python -m venv .venv
|
||||
source .venv/bin/activate
|
||||
```
|
||||
|
||||
2. Install dependencies
|
||||
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
3. Install spaCy English model
|
||||
|
||||
```bash
|
||||
python -m spacy download en_core_web_sm
|
||||
```
|
||||
|
||||
## How To Run
|
||||
|
||||
### A) Extract keywords only
|
||||
|
||||
```bash
|
||||
python example_usage.py "furret long slender agile creature with soft fur"
|
||||
```
|
||||
|
||||
Output: JSON list of keywords.
|
||||
|
||||
### B) End-to-end: text -> keywords -> JSON
|
||||
|
||||
```bash
|
||||
python infer_json_usage.py --template json_template_example.json "furret long slender agile creature with soft fur"
|
||||
```
|
||||
|
||||
Output order:
|
||||
|
||||
1. keyword list
|
||||
2. inferred JSON
|
||||
|
||||
### C) End-to-end but JSON only
|
||||
|
||||
```bash
|
||||
python infer_json_usage.py --json-only --template json_template_example.json "furret long slender agile creature with soft fur"
|
||||
```
|
||||
|
||||
### D) Start from keywords directly
|
||||
|
||||
```bash
|
||||
python infer_json_usage.py --template json_template_example.json --keywords furret normal tail smash tunnel agile cheerful explore endurance
|
||||
```
|
||||
|
||||
Tip: If you pass `--keywords`, text extraction is skipped.
|
||||
|
||||
## Template Behavior
|
||||
|
||||
If `--template` is omitted, inference returns a full inferred profile object.
|
||||
|
||||
If `--template` is provided:
|
||||
|
||||
- empty fields are populated from inferred values
|
||||
- non-empty fields are preserved
|
||||
|
||||
Current sample template supports nested card-like data including:
|
||||
|
||||
- `types`
|
||||
- `attacks` with `cost`, `name`, `effect`, `damage`
|
||||
- `weaknesses` with `type`, `value`
|
||||
- `stage`, `retreat`, `legal`
|
||||
|
||||
## Tests
|
||||
|
||||
Run all tests:
|
||||
|
||||
```bash
|
||||
python -m unittest -q
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### 1) spaCy model not found
|
||||
|
||||
Error mentions `en_core_web_sm` not installed.
|
||||
|
||||
Fix:
|
||||
|
||||
```bash
|
||||
python -m spacy download en_core_web_sm
|
||||
```
|
||||
|
||||
### 2) spaCy import/runtime problems on very new Python versions
|
||||
|
||||
Use Python 3.13 or lower and reinstall requirements.
|
||||
|
||||
### 3) `--template` path errors
|
||||
|
||||
Ensure `--template` points to a valid file path, for example:
|
||||
|
||||
```bash
|
||||
--template json_template_example.json
|
||||
```
|
||||
|
||||
If your input is already a keyword list, use `--keywords` instead of putting the list in `--template`.
|
||||
|
||||
## Design Notes
|
||||
|
||||
- deterministic and explainable (no LLM calls)
|
||||
- domain mappings are easy to extend in `keyword_extractor.py` and `json_inference.py`
|
||||
- scoring and template fill rules are intentionally simple and stable for game-content generation
|
||||
@@ -0,0 +1,36 @@
|
||||
import argparse
|
||||
import json
|
||||
from typing import Sequence
|
||||
|
||||
from keyword_extractor import KeywordExtractor
|
||||
|
||||
|
||||
def _build_parser() -> argparse.ArgumentParser:
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Extract normalized keywords from cleaned text.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"text",
|
||||
nargs="+",
|
||||
help="Input text to process. Pass as one quoted string or multiple words.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model",
|
||||
default="en_core_web_sm",
|
||||
help="spaCy model name (default: en_core_web_sm).",
|
||||
)
|
||||
return parser
|
||||
|
||||
|
||||
def main(argv: Sequence[str] | None = None) -> None:
    """CLI entry point: parse args, extract keywords, print them as JSON."""
    args = _build_parser().parse_args(argv)

    joined_text = " ".join(args.text)
    extractor = KeywordExtractor.from_default_model(model_name=args.model)
    print(json.dumps(extractor.extract(joined_text)))
|
||||
@@ -0,0 +1,137 @@
|
||||
"""Rule-based keyword extraction and normalization for Pokemon card generation."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Dict, Iterable, List, Mapping, Optional, Sequence, Set, Tuple
|
||||
|
||||
# Canonical concept -> synonym list
|
||||
from typing import Dict, List
|
||||
|
||||
DEFAULT_NORMALIZATION_MAP: Dict[str, List[str]] = {
|
||||
"normal": ["basic", "common", "regular", "plain"],
|
||||
"fire": ["flame", "flames", "burn", "burning", "blaze", "fiery", "heat", "inferno"],
|
||||
"water": ["wave", "ocean", "sea", "river", "aqua", "splash", "tidal"],
|
||||
"grass": ["plant", "leaf", "forest", "nature", "vine", "seed", "flora"],
|
||||
"flying": ["air", "wind", "sky", "wing", "wings", "flight", "soar"],
|
||||
"fighting": ["punch", "kick", "strike", "martial", "combat", "brawl"],
|
||||
"poison": ["toxic", "venom", "acid", "poisonous", "toxin"],
|
||||
"electric": ["lightning", "thunder", "shock", "volt", "spark", "electricity"],
|
||||
"ground": ["earth", "soil", "sand", "mud", "quake", "dust"],
|
||||
"rock": ["stone", "boulder", "crystal", "rocky", "pebble"],
|
||||
"psychic": ["mind", "mental", "telepathy", "psyonic", "brain", "illusion"],
|
||||
"ice": ["freeze", "frozen", "snow", "frost", "blizzard", "icy"],
|
||||
"bug": ["insect", "ant", "beetle", "spider", "crawler"],
|
||||
"ghost": ["spirit", "phantom", "haunt", "shadow", "specter"],
|
||||
"steel": ["metal", "iron", "armor", "blade", "alloy"],
|
||||
"dragon": ["drake", "wyrm", "serpent", "legendary"],
|
||||
"dark": ["shadow", "evil", "night", "doom", "darkness"],
|
||||
"fairy": ["magic", "magical", "sparkle", "light", "charm"],
|
||||
}
|
||||
|
||||
DEFAULT_ALLOWED_POS: Tuple[str, ...] = ("NOUN", "ADJ", "VERB")
|
||||
|
||||
|
||||
def _invert_normalization_map(normalization_map: Mapping[str, Iterable[str]]) -> Dict[str, str]:
|
||||
"""Build synonym -> canonical mapping for O(1) normalization lookup."""
|
||||
inverse: Dict[str, str] = {}
|
||||
for canonical, synonyms in normalization_map.items():
|
||||
canonical_normalized = canonical.strip().lower()
|
||||
inverse[canonical_normalized] = canonical_normalized
|
||||
for synonym in synonyms:
|
||||
synonym_normalized = synonym.strip().lower()
|
||||
if synonym_normalized:
|
||||
inverse[synonym_normalized] = canonical_normalized
|
||||
return inverse
|
||||
|
||||
|
||||
def _deduplicate_preserve_order(items: Iterable[str]) -> List[str]:
|
||||
seen: Set[str] = set()
|
||||
output: List[str] = []
|
||||
for item in items:
|
||||
if item not in seen:
|
||||
seen.add(item)
|
||||
output.append(item)
|
||||
return output
|
||||
|
||||
|
||||
@dataclass
class KeywordExtractor:
    """Deterministic spaCy + rule-based keyword extraction pipeline.

    Pipeline order matters: POS filtering + lemmatization first, then
    dedup, then synonym normalization, then a second dedup (since several
    words may collapse onto one canonical concept).
    """

    # Loaded spaCy Language object — or any callable returning token-like
    # objects exposing .is_stop/.is_punct/.pos_/.lemma_/.text (the tests
    # substitute a fake).
    nlp: Any
    # Canonical concept -> synonym list; inverted once in __post_init__.
    normalization_map: Mapping[str, Iterable[str]] = field(default_factory=lambda: DEFAULT_NORMALIZATION_MAP)
    # POS tags that survive filtering (content words only by default).
    allowed_pos: Sequence[str] = field(default_factory=lambda: DEFAULT_ALLOWED_POS)

    def __post_init__(self) -> None:
        # Precompute O(1) lookup structures once per extractor instance.
        self._normalization_lookup = _invert_normalization_map(self.normalization_map)
        self._allowed_pos_set = set(self.allowed_pos)

    @classmethod
    def from_default_model(
        cls,
        model_name: str = "en_core_web_sm",
        normalization_map: Optional[Mapping[str, Iterable[str]]] = None,
        allowed_pos: Sequence[str] = DEFAULT_ALLOWED_POS,
    ) -> "KeywordExtractor":
        """Initialize extractor with a spaCy English pipeline.

        Raises OSError with an install hint when the model is absent, and
        RuntimeError when spaCy itself fails to load (e.g. unsupported
        Python version).
        """
        try:
            import spacy

            nlp = spacy.load(model_name)
        except OSError as exc:
            # Model package missing — give the exact install command.
            raise OSError(
                f"spaCy model '{model_name}' is not installed. "
                "Run: python -m spacy download en_core_web_sm"
            ) from exc
        except Exception as exc:
            raise RuntimeError(
                "spaCy could not be loaded in this Python environment. "
                "Try Python 3.13 or lower, then install spaCy and en_core_web_sm."
            ) from exc

        return cls(
            nlp=nlp,
            normalization_map=normalization_map or DEFAULT_NORMALIZATION_MAP,
            allowed_pos=allowed_pos,
        )

    def extract(self, text: str) -> List[str]:
        """Extract and normalize keywords from already-cleaned text.

        Returns an ordered, duplicate-free list of canonical keywords;
        empty list for empty/whitespace input.
        """
        if not text or not text.strip():
            return []

        doc = self.nlp(text)

        # Step 1: POS filtering + base normalization to lowercase lemmas/tokens.
        raw_keywords: List[str] = []
        for token in doc:
            if token.is_stop or token.is_punct or token.pos_ not in self._allowed_pos_set:
                continue

            # Use lemma where possible to collapse inflections.
            # ("-PRON-" is spaCy v2's pronoun lemma placeholder.)
            base = token.lemma_.lower().strip() if token.lemma_ and token.lemma_ != "-PRON-" else token.text.lower().strip()
            if base:
                raw_keywords.append(base)

        # Step 2: Deduplicate before domain normalization (as requested in README).
        deduplicated = _deduplicate_preserve_order(raw_keywords)

        # Step 3: Map variants/synonyms to canonical concepts.
        normalized = [self._normalize_keyword(keyword) for keyword in deduplicated]

        # Step 4: Deduplicate again, since multiple words can map to one concept.
        return _deduplicate_preserve_order(normalized)

    def _normalize_keyword(self, keyword: str) -> str:
        # Unknown words pass through unchanged (lowercased).
        keyword_lower = keyword.lower()
        return self._normalization_lookup.get(keyword_lower, keyword_lower)
|
||||
|
||||
|
||||
def extract_keywords(
    text: str,
    extractor: Optional[KeywordExtractor] = None,
) -> List[str]:
    """Extract keywords from *text*, building a default extractor if none given."""
    chosen = KeywordExtractor.from_default_model() if not extractor else extractor
    return chosen.extract(text)
|
||||
@@ -0,0 +1,88 @@
|
||||
import unittest
|
||||
|
||||
from keyword_extractor import KeywordExtractor
|
||||
|
||||
|
||||
class FakeToken:
    """Minimal stand-in for a spaCy Token used by the fake pipeline."""

    def __init__(self, text: str, pos: str, lemma: str, is_stop: bool) -> None:
        self.text = text
        self.pos_ = pos
        self.lemma_ = lemma
        self.is_stop = is_stop
        # Punctuation-like: the token contains no alphanumeric character.
        self.is_punct = all(not ch.isalnum() for ch in text)
|
||||
|
||||
|
||||
class FakeNLP:
    """Callable that tokenizes on whitespace and tags via a fixed map."""

    def __init__(self, tag_map, stopwords) -> None:
        self.tag_map = tag_map
        self.stopwords = stopwords

    def __call__(self, text: str):
        def _make_token(raw: str) -> FakeToken:
            word = raw.strip()
            lowered = word.lower()
            return FakeToken(
                text=word,
                # Unknown words default to NOUN so they survive POS filtering.
                pos=self.tag_map.get(lowered, "NOUN"),
                lemma=lowered,
                is_stop=lowered in self.stopwords,
            )

        return [_make_token(raw) for raw in text.split()]
|
||||
|
||||
|
||||
class KeywordExtractorTests(unittest.TestCase):
    """Unit tests for KeywordExtractor using a deterministic fake spaCy pipeline."""

    @classmethod
    def setUpClass(cls) -> None:
        # Fixed word -> POS map so the tests do not depend on a real model.
        tag_map = {
            "fiery": "ADJ",
            "dragon": "NOUN",
            "attack": "VERB",
            "explosive": "ADJ",
            "flames": "NOUN",
            "burning": "ADJ",
            "creature": "NOUN",
            "with": "ADP",
            "blaze": "NOUN",
            "power": "NOUN",
            "electric": "ADJ",
            "mouse": "NOUN",
            "using": "VERB",
            "thunder": "NOUN",
            "shock": "NOUN",
            "a": "DET",
            "very": "ADV",
            "strong": "ADJ",
            "and": "CCONJ",
            "dangerous": "ADJ",
        }

        stopwords = {"a", "very", "and", "with"}
        cls.nlp = FakeNLP(tag_map=tag_map, stopwords=stopwords)
        cls.extractor = KeywordExtractor(nlp=cls.nlp)

    def test_readme_main_example(self) -> None:
        # NOTE(review): DEFAULT_NORMALIZATION_MAP has no "explosion" entry
        # and "explosive" is not listed as a synonym of anything, so the
        # extractor would return "explosive" here, not "explosion" — this
        # expectation looks inconsistent with the map; confirm which side
        # is intended.
        text = "fiery dragon attack explosive flames"
        result = self.extractor.extract(text)
        self.assertEqual(result, ["fire", "dragon", "attack", "explosion"])

    def test_synonym_normalization(self) -> None:
        # "burning" and "blaze" both collapse to "fire"; dedup keeps one.
        text = "burning creature with blaze power"
        result = self.extractor.extract(text)
        self.assertEqual(result, ["fire", "creature", "power"])

    def test_mixed_types(self) -> None:
        # "thunder" and "shock" both map to "electric", already present.
        text = "electric mouse using thunder shock"
        result = self.extractor.extract(text)
        self.assertEqual(result, ["electric", "mouse", "using"])

    def test_noise_input(self) -> None:
        # Stopwords and non-content POS (DET/ADV/CCONJ) are filtered out.
        text = "a very very strong and dangerous creature"
        result = self.extractor.extract(text)
        self.assertEqual(result, ["strong", "dangerous", "creature"])
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
189
clean-text-to-keywords/README.md
Normal file
189
clean-text-to-keywords/README.md
Normal file
@@ -0,0 +1,189 @@
|
||||
# Pokemon Text-to-JSON Pipeline
|
||||
|
||||
This project converts free-form Pokemon description text into:
|
||||
|
||||
1. A normalized keyword list
|
||||
2. A populated Pokemon JSON object (from a blank/key-only template)
|
||||
|
||||
The pipeline is deterministic and rule-based.
|
||||
|
||||
## Architecture
|
||||
|
||||
### Stage 1: Keyword Extraction
|
||||
|
||||
File: `keyword_extractor.py`
|
||||
|
||||
Input: raw text description
|
||||
|
||||
Core logic:
|
||||
|
||||
- spaCy tokenization and POS tagging
|
||||
- POS filtering (`NOUN`, `ADJ`, `VERB`)
|
||||
- stopword and punctuation removal
|
||||
- lemma-based normalization
|
||||
- domain synonym normalization (example: `flames -> fire`)
|
||||
- optional YAKE relevance scoring
|
||||
- conservative retention policy so detail is not over-pruned
|
||||
|
||||
Output: ordered list of normalized keywords
|
||||
|
||||
### Stage 2: JSON Inference
|
||||
|
||||
File: `json_inference.py`
|
||||
|
||||
Input: keyword list + optional JSON template
|
||||
|
||||
Core logic:
|
||||
|
||||
- infer primary/secondary type
|
||||
- infer name candidate
|
||||
- infer attacks, abilities, habitat, personality
|
||||
- infer basic stats (`hp`, `attack`, `defense`, `speed`)
|
||||
- fill nested TCG-like template fields (`types`, `attacks`, `weaknesses`, `stage`, `retreat`, etc.)
|
||||
- preserve already non-empty values in the provided template
|
||||
|
||||
Output: inferred JSON profile
|
||||
|
||||
### Stage 3: Orchestration CLI
|
||||
|
||||
File: `infer_json_usage.py`
|
||||
|
||||
This is the main entrypoint for end-to-end usage.
|
||||
|
||||
Default behavior:
|
||||
|
||||
1. prints extracted keyword list
|
||||
2. prints inferred JSON
|
||||
|
||||
## Project Structure
|
||||
|
||||
- `keyword_extractor.py`: keyword extraction engine
|
||||
- `json_inference.py`: keyword-to-JSON inference logic
|
||||
- `infer_json_usage.py`: end-to-end CLI
|
||||
- `example_usage.py`: keyword extraction only CLI
|
||||
- `json_template_example.json`: sample blank/key-only template
|
||||
- `test_keyword_extractor.py`: extraction tests
|
||||
- `test_json_inference.py`: inference tests
|
||||
- `requirements.txt`: Python dependencies
|
||||
|
||||
## Requirements
|
||||
|
||||
- Python 3.13 or lower is recommended for spaCy compatibility
|
||||
- pip
|
||||
|
||||
Dependencies in `requirements.txt`:
|
||||
|
||||
- `spacy>=3.7.0`
|
||||
- `yake>=0.4.2`
|
||||
|
||||
## Setup
|
||||
|
||||
1. Create and activate a virtual environment (recommended)
|
||||
|
||||
```bash
|
||||
python -m venv .venv
|
||||
source .venv/bin/activate
|
||||
```
|
||||
|
||||
2. Install dependencies
|
||||
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
3. Install spaCy English model
|
||||
|
||||
```bash
|
||||
python -m spacy download en_core_web_sm
|
||||
```
|
||||
|
||||
## How To Run
|
||||
|
||||
### A) Extract keywords only
|
||||
|
||||
```bash
|
||||
python example_usage.py "furret long slender agile creature with soft fur"
|
||||
```
|
||||
|
||||
Output: JSON list of keywords.
|
||||
|
||||
### B) End-to-end: text -> keywords -> JSON
|
||||
|
||||
```bash
|
||||
python infer_json_usage.py --template json_template_example.json "furret long slender agile creature with soft fur"
|
||||
```
|
||||
|
||||
Output order:
|
||||
|
||||
1. keyword list
|
||||
2. inferred JSON
|
||||
|
||||
### C) End-to-end but JSON only
|
||||
|
||||
```bash
|
||||
python infer_json_usage.py --json-only --template json_template_example.json "furret long slender agile creature with soft fur"
|
||||
```
|
||||
|
||||
### D) Start from keywords directly
|
||||
|
||||
```bash
|
||||
python infer_json_usage.py --template json_template_example.json --keywords furret normal tail smash tunnel agile cheerful explore endurance
|
||||
```
|
||||
|
||||
Tip: If you pass `--keywords`, text extraction is skipped.
|
||||
|
||||
## Template Behavior
|
||||
|
||||
If `--template` is omitted, inference returns a full inferred profile object.
|
||||
|
||||
If `--template` is provided:
|
||||
|
||||
- empty fields are populated from inferred values
|
||||
- non-empty fields are preserved
|
||||
|
||||
Current sample template supports nested card-like data including:
|
||||
|
||||
- `types`
|
||||
- `attacks` with `cost`, `name`, `effect`, `damage`
|
||||
- `weaknesses` with `type`, `value`
|
||||
- `stage`, `retreat`, `legal`
|
||||
|
||||
## Tests
|
||||
|
||||
Run all tests:
|
||||
|
||||
```bash
|
||||
python -m unittest -q
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### 1) spaCy model not found
|
||||
|
||||
Error mentions `en_core_web_sm` not installed.
|
||||
|
||||
Fix:
|
||||
|
||||
```bash
|
||||
python -m spacy download en_core_web_sm
|
||||
```
|
||||
|
||||
### 2) spaCy import/runtime problems on very new Python versions
|
||||
|
||||
Use Python 3.13 or lower and reinstall requirements.
|
||||
|
||||
### 3) `--template` path errors
|
||||
|
||||
Ensure `--template` points to a valid file path, for example:
|
||||
|
||||
```bash
|
||||
--template json_template_example.json
|
||||
```
|
||||
|
||||
If your input is already a keyword list, use `--keywords` instead of putting the list in `--template`.
|
||||
|
||||
## Design Notes
|
||||
|
||||
- deterministic and explainable (no LLM calls)
|
||||
- domain mappings are easy to extend in `keyword_extractor.py` and `json_inference.py`
|
||||
- scoring and template fill rules are intentionally simple and stable for game-content generation
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
36
clean-text-to-keywords/example_usage.py
Normal file
36
clean-text-to-keywords/example_usage.py
Normal file
@@ -0,0 +1,36 @@
|
||||
import argparse
|
||||
import json
|
||||
from typing import Sequence
|
||||
|
||||
from keyword_extractor import KeywordExtractor
|
||||
|
||||
|
||||
def _build_parser() -> argparse.ArgumentParser:
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Extract normalized keywords from cleaned text.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"text",
|
||||
nargs="+",
|
||||
help="Input text to process. Pass as one quoted string or multiple words.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model",
|
||||
default="en_core_web_sm",
|
||||
help="spaCy model name (default: en_core_web_sm).",
|
||||
)
|
||||
return parser
|
||||
|
||||
|
||||
def main(argv: Sequence[str] | None = None) -> None:
    """CLI entry point: parse args, extract keywords, print them as JSON."""
    args = _build_parser().parse_args(argv)

    joined_text = " ".join(args.text)
    extractor = KeywordExtractor.from_default_model(model_name=args.model)
    print(json.dumps(extractor.extract(joined_text)))
|
||||
|
||||
|
||||
# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
|
||||
111
clean-text-to-keywords/infer_json_usage.py
Normal file
111
clean-text-to-keywords/infer_json_usage.py
Normal file
@@ -0,0 +1,111 @@
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from typing import Sequence
|
||||
|
||||
from keyword_extractor import KeywordExtractor
|
||||
from json_inference import fill_template_from_keywords
|
||||
|
||||
|
||||
def _build_parser() -> argparse.ArgumentParser:
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Extract keywords and infer values into a JSON template.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"text",
|
||||
nargs="*",
|
||||
help="Input description text.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--template",
|
||||
default="",
|
||||
help="Path to JSON template file with keys only. If omitted, full inferred JSON is returned.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model",
|
||||
default="en_core_web_sm",
|
||||
help="spaCy model name (default: en_core_web_sm).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--keywords",
|
||||
nargs="+",
|
||||
default=None,
|
||||
help="Provide keywords directly instead of raw text.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--json-only",
|
||||
action="store_true",
|
||||
help="Print only inferred JSON (skip keyword list output).",
|
||||
)
|
||||
return parser
|
||||
|
||||
|
||||
def _load_template(path: str):
|
||||
if not path:
|
||||
return {}
|
||||
|
||||
if not os.path.exists(path):
|
||||
raise FileNotFoundError(f"Template file not found: {path}")
|
||||
|
||||
with open(path, "r", encoding="utf-8") as file_handle:
|
||||
raw = file_handle.read().strip()
|
||||
if not raw:
|
||||
return {}
|
||||
return json.loads(raw)
|
||||
|
||||
|
||||
def _parse_keywords_fragment(raw: str):
|
||||
if not raw.strip():
|
||||
return []
|
||||
|
||||
try:
|
||||
parsed = json.loads(raw)
|
||||
if isinstance(parsed, list):
|
||||
return [str(item).strip().lower() for item in parsed if str(item).strip()]
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
tokens = re.findall(r"[a-zA-Z0-9_-]+", raw.lower())
|
||||
return [token for token in tokens if token]
|
||||
|
||||
|
||||
def _extract_keywords(args):
|
||||
if args.keywords:
|
||||
return [word.strip().lower() for word in args.keywords if word.strip()]
|
||||
|
||||
if args.template and not os.path.exists(args.template) and args.template.lstrip().startswith("["):
|
||||
raw = " ".join([args.template] + args.text)
|
||||
return _parse_keywords_fragment(raw)
|
||||
|
||||
if not args.text:
|
||||
raise ValueError("Provide input text or use --keywords.")
|
||||
|
||||
text = " ".join(args.text)
|
||||
extractor = KeywordExtractor.from_default_model(model_name=args.model)
|
||||
return extractor.extract(text)
|
||||
|
||||
|
||||
def main(argv: Sequence[str] | None = None) -> None:
    """CLI entry point: extract keywords, fill the template, print results."""
    args = _build_parser().parse_args(argv)

    keywords = _extract_keywords(args)

    # An inline "[...]" --template value was already consumed as keywords,
    # so drop it before template loading.
    template_path = args.template
    if template_path and not os.path.exists(template_path) and template_path.lstrip().startswith("["):
        template_path = ""

    inferred_json = fill_template_from_keywords(_load_template(template_path), keywords)

    if not args.json_only:
        print(json.dumps(keywords))
    print(json.dumps(inferred_json, indent=2))
|
||||
|
||||
|
||||
# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
|
||||
398
clean-text-to-keywords/json_inference.py
Normal file
398
clean-text-to-keywords/json_inference.py
Normal file
@@ -0,0 +1,398 @@
|
||||
"""Infer Pokemon-like JSON values from extracted keywords."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from copy import deepcopy
|
||||
from typing import Any, Dict, Iterable, List, Mapping, Sequence
|
||||
|
||||
# --- Domain vocabularies driving the rule-based inference ---------------------

# The 18 canonical Pokemon type names a keyword can match.
POKEMON_TYPES = {
    "normal", "fire", "water", "grass", "electric", "ice",
    "fighting", "poison", "ground", "flying", "psychic", "bug",
    "rock", "ghost", "dragon", "dark", "steel", "fairy",
}

# Words that indicate where the creature lives (first match becomes habitat).
HABITAT_KEYWORDS = {
    "forest", "field", "cave", "mountain", "river", "ocean", "sea",
    "tunnel", "nest", "sky", "desert", "swamp", "volcano",
}

# Temperament descriptors (up to three are kept, in input order).
PERSONALITY_KEYWORDS = {
    "calm", "gentle", "agile", "playful", "cheerful", "energetic",
    "curious", "fierce", "brave", "loyal", "timid", "bold",
}

# Words that become attack/move names (up to four are kept).
MOVE_KEYWORDS = {
    "attack", "smash", "strike", "kick", "punch", "shock", "thunder",
    "bolt", "blast", "explosion", "freeze", "bite", "claw", "tail", "fight",
}

# Words that become passive abilities (up to four are kept).
ABILITY_KEYWORDS = {
    "recover", "endurance", "explore", "hide", "wander", "bond",
    "speed", "power", "energy", "flexible",
}

# Keyword hints that raise each base stat (+10 per match in _score_stat).
STAT_HINTS = {
    "hp": {"endurance", "recover", "energy", "stamina", "healthy", "vital"},
    "attack": {"attack", "smash", "strike", "punch", "kick", "claw", "fight", "power"},
    "defense": {"armor", "shield", "tough", "hard", "resist", "solid"},
    "speed": {"speed", "swift", "agile", "quick", "fast", "dash"},
}

# Template key aliases -> canonical inferred-profile key (see _canonical_key).
KEY_ALIASES = {
    "name": {"name", "pokemon_name"},
    "type": {"type", "primary_type", "pokemon_type"},
    "secondary_type": {"secondary_type", "type2", "secondary"},
    "attacks": {"attacks", "moves", "skills", "offense"},
    "abilities": {"abilities", "traits", "passives", "special_abilities"},
    "habitat": {"habitat", "environment", "region"},
    "personality": {"personality", "temperament", "nature"},
    "description": {"description", "flavor_text", "summary", "lore"},
    "keywords": {"keywords", "tags"},
    "hp": {"hp", "health", "health_points"},
    "attack": {"attack", "atk"},
    "defense": {"defense", "def"},
    "speed": {"speed", "spd"},
}

# Generic colors/descriptors that must never be promoted to a creature name.
GENERIC_NAME_BLACKLIST = {
    "black", "white", "yellow", "red", "blue", "green", "purple",
    "orange", "pink", "gray", "grey", "brown", "fur", "body", "tail",
    "claw", "storm", "cloud", "enemy", "super", "scary", "giant", "speed",
}

# Simplified single-weakness chart: the type each type is weak against.
TYPE_WEAKNESS = {
    "normal": "fighting",
    "fire": "water",
    "water": "electric",
    "grass": "fire",
    "electric": "ground",
    "ice": "fire",
    "fighting": "psychic",
    "poison": "ground",
    "ground": "water",
    "flying": "electric",
    "psychic": "dark",
    "bug": "fire",
    "rock": "water",
    "ghost": "dark",
    "dragon": "fairy",
    "dark": "fighting",
    "steel": "fire",
    "fairy": "steel",
}
|
||||
|
||||
|
||||
def _title_case(value: str) -> str:
|
||||
return " ".join(part.capitalize() for part in value.split())
|
||||
|
||||
|
||||
def _is_empty_value(value: Any) -> bool:
|
||||
if value is None:
|
||||
return True
|
||||
if isinstance(value, str):
|
||||
return value.strip() == ""
|
||||
if isinstance(value, (list, dict, tuple, set)):
|
||||
return len(value) == 0
|
||||
return False
|
||||
|
||||
|
||||
def _canonical_key(key: str) -> str:
    """Map an alias key (e.g. "atk") to its canonical template key."""
    normalized = key.lower().strip()
    for canonical_name, alias_set in KEY_ALIASES.items():
        if normalized in alias_set:
            return canonical_name
    # Unknown keys pass through unchanged (lowercased/trimmed).
    return normalized
|
||||
|
||||
|
||||
def _pick_name(keywords: Sequence[str]) -> str:
    """Choose the first keyword that looks like a creature name.

    Domain vocabulary, generic descriptors, and very short tokens are
    skipped; when nothing qualifies, "Unknown" is returned.
    """
    reserved = (
        POKEMON_TYPES
        | HABITAT_KEYWORDS
        | MOVE_KEYWORDS
        | ABILITY_KEYWORDS
        | PERSONALITY_KEYWORDS
        | GENERIC_NAME_BLACKLIST
    )
    for candidate in keywords:
        if candidate in reserved or len(candidate) < 4:
            continue
        return _title_case(candidate)
    return "Unknown"
|
||||
|
||||
|
||||
def _pick_types(keywords: Sequence[str]) -> List[str]:
    """Collect up to two distinct recognized types, defaulting to normal."""
    found: List[str] = []
    for word in keywords:
        if word in POKEMON_TYPES and word not in found:
            found.append(word)
        if len(found) >= 2:
            break
    return found if found else ["normal"]
|
||||
|
||||
|
||||
def _pick_habitat(keywords: Sequence[str]) -> str:
    """Return the first habitat keyword, or "unknown" if none appear."""
    return next((word for word in keywords if word in HABITAT_KEYWORDS), "unknown")
|
||||
|
||||
|
||||
def _pick_personality(keywords: Sequence[str]) -> List[str]:
    """Return up to three distinct personality keywords, in input order."""
    # dict.fromkeys dedupes while preserving first-seen order.
    traits = [word for word in dict.fromkeys(keywords) if word in PERSONALITY_KEYWORDS]
    return traits[:3]
|
||||
|
||||
|
||||
def _pick_attacks(keywords: Sequence[str]) -> List[str]:
    """Return up to four distinct move keywords, in input order."""
    # dict.fromkeys dedupes while preserving first-seen order.
    moves = [word for word in dict.fromkeys(keywords) if word in MOVE_KEYWORDS]
    return moves[:4]
|
||||
|
||||
|
||||
def _pick_abilities(keywords: Sequence[str]) -> List[str]:
    """Return up to four distinct ability keywords, in input order."""
    # dict.fromkeys dedupes while preserving first-seen order.
    traits = [word for word in dict.fromkeys(keywords) if word in ABILITY_KEYWORDS]
    return traits[:4]
|
||||
|
||||
|
||||
def _score_stat(base: int, keywords: Sequence[str], hints: Iterable[str]) -> int:
|
||||
hint_set = set(hints)
|
||||
matches = sum(1 for keyword in keywords if keyword in hint_set)
|
||||
# Each match adds 10 points; keep stats in [40, 160].
|
||||
return max(40, min(160, base + (matches * 10)))
|
||||
|
||||
|
||||
def _build_description(name: str, primary_type: str, attacks: Sequence[str], abilities: Sequence[str], habitat: str) -> str:
|
||||
attack_text = ", ".join(attacks) if attacks else "basic combat"
|
||||
ability_text = ", ".join(abilities) if abilities else "balanced adaptation"
|
||||
return (
|
||||
f"{name} is a {primary_type}-type Pokemon often found in {habitat}. "
|
||||
f"It commonly uses {attack_text} and shows abilities like {ability_text}."
|
||||
)
|
||||
|
||||
|
||||
def _retreat_cost_from_speed(speed: int) -> int:
|
||||
if speed >= 120:
|
||||
return 0
|
||||
if speed >= 90:
|
||||
return 1
|
||||
if speed >= 70:
|
||||
return 2
|
||||
return 3
|
||||
|
||||
|
||||
def _attack_damage_from_attack_stat(attack_stat: int, index: int) -> int:
|
||||
# Keep card damage in simple 10-step increments.
|
||||
base = 30 + max(0, attack_stat - 70) // 2
|
||||
adjusted = base + (index * 10)
|
||||
return max(10, min(160, (adjusted // 10) * 10))
|
||||
|
||||
|
||||
def _energy_name_for_type(pokemon_type: str) -> str:
    """TCG energy name for a type; "normal" maps to "Colorless"."""
    return "Colorless" if pokemon_type == "normal" else _title_case(pokemon_type)
|
||||
|
||||
|
||||
def _fill_tcg_like_template(output: Dict[str, Any], inferred: Mapping[str, Any]) -> None:
    """Populate empty TCG-card-style fields of *output* in place.

    Only keys already present in *output* are touched, and only when their
    current value is empty; non-empty values are always preserved.
    """
    if "name" in output and _is_empty_value(output.get("name")):
        output["name"] = inferred["name"]

    if "description" in output and _is_empty_value(output.get("description")):
        output["description"] = inferred["description"]

    if "hp" in output and _is_empty_value(output.get("hp")):
        hp_value = inferred["hp"]
        # Match the template's declared shape: keep HP a string when the
        # template used a string placeholder, else store the raw value.
        output["hp"] = str(hp_value) if isinstance(output.get("hp"), str) else hp_value

    if "types" in output and isinstance(output.get("types"), list):
        types_value = output["types"]
        # A list of empty placeholders (e.g. [""]) counts as empty too.
        if len(types_value) == 0 or all(_is_empty_value(item) for item in types_value):
            inferred_types = [inferred["type"]]
            if inferred.get("secondary_type"):
                inferred_types.append(inferred["secondary_type"])
            output["types"] = inferred_types

    if "stage" in output and _is_empty_value(output.get("stage")):
        output["stage"] = "Basic"

    # 0 counts as "unset" here so a placeholder retreat cost is replaced.
    if "retreat" in output and (output.get("retreat") in (None, 0, "")):
        output["retreat"] = _retreat_cost_from_speed(int(inferred["speed"]))

    if "weaknesses" in output and isinstance(output.get("weaknesses"), list):
        weaknesses = output["weaknesses"]
        if weaknesses:
            weakness_type = TYPE_WEAKNESS.get(inferred["type"], "fighting")
            # Only the first weakness entry is ever filled.
            first = weaknesses[0]
            if isinstance(first, dict):
                if _is_empty_value(first.get("type")):
                    first["type"] = weakness_type
                if _is_empty_value(first.get("value")):
                    first["value"] = "x2"

    if "attacks" in output and isinstance(output.get("attacks"), list):
        attack_entries = output["attacks"]
        inferred_attacks = inferred["attacks"]
        inferred_type = inferred["type"]
        for idx, attack_entry in enumerate(attack_entries):
            if not isinstance(attack_entry, dict):
                continue

            # Fall back to a generic move when the template has more attack
            # slots than keywords were inferred.
            attack_name = inferred_attacks[idx] if idx < len(inferred_attacks) else "tackle"
            attack_title = _title_case(attack_name)
            if _is_empty_value(attack_entry.get("name")):
                attack_entry["name"] = attack_title
            if _is_empty_value(attack_entry.get("effect")):
                attack_entry["effect"] = f"Deals damage with {attack_name}."

            # Damage 0 is a placeholder, so it is also replaced.
            if "damage" in attack_entry and (attack_entry.get("damage") in (None, 0, "")):
                attack_entry["damage"] = _attack_damage_from_attack_stat(int(inferred["attack"]), idx)

            if "cost" in attack_entry and isinstance(attack_entry.get("cost"), list):
                current_cost = attack_entry["cost"]
                if len(current_cost) == 0 or all(_is_empty_value(item) for item in current_cost):
                    attack_entry["cost"] = [_energy_name_for_type(inferred_type)]
|
||||
|
||||
|
||||
def infer_profile_from_keywords(keywords: Sequence[str]) -> Dict[str, Any]:
    """Build a full inferred creature profile from raw keyword strings."""
    # Normalize input first; empty/whitespace keywords are dropped.
    cleaned = [word.strip().lower() for word in keywords if word and word.strip()]

    name = _pick_name(cleaned)
    types = _pick_types(cleaned)
    attacks = _pick_attacks(cleaned)
    abilities = _pick_abilities(cleaned)
    habitat = _pick_habitat(cleaned)
    personality = _pick_personality(cleaned)

    # Every stat starts from a 70 base and grows with matching hint words.
    hp = _score_stat(70, cleaned, STAT_HINTS["hp"])
    attack = _score_stat(70, cleaned, STAT_HINTS["attack"])
    defense = _score_stat(70, cleaned, STAT_HINTS["defense"])
    speed = _score_stat(70, cleaned, STAT_HINTS["speed"])

    return {
        "name": name,
        "type": types[0],
        "secondary_type": types[1] if len(types) > 1 else None,
        "attacks": attacks,
        "abilities": abilities,
        "habitat": habitat,
        "personality": personality,
        "hp": hp,
        "attack": attack,
        "defense": defense,
        "speed": speed,
        "keywords": cleaned,
        "description": _build_description(name, types[0], attacks, abilities, habitat),
    }
|
||||
|
||||
|
||||
def fill_template_from_keywords(template: Mapping[str, Any], keywords: Sequence[str]) -> Dict[str, Any]:
    """Fill empty template fields with values inferred from *keywords*.

    Existing non-empty values in *template* are preserved; an empty
    template yields the full inferred profile.
    """
    inferred = infer_profile_from_keywords(keywords)
    output: Dict[str, Any] = deepcopy(dict(template))

    if not output:
        return inferred

    # Nested card-style fields (types/attacks/weaknesses/...) first.
    _fill_tcg_like_template(output, inferred)

    # Then any remaining flat fields whose alias maps to inferred data.
    for field_name, existing_value in output.items():
        canonical = _canonical_key(field_name)
        if canonical in inferred and _is_empty_value(existing_value):
            output[field_name] = inferred[canonical]

    return output
|
||||
35
clean-text-to-keywords/json_template_example.json
Normal file
35
clean-text-to-keywords/json_template_example.json
Normal file
@@ -0,0 +1,35 @@
|
||||
{
|
||||
"category": "Pokemon",
|
||||
"name": "",
|
||||
"rarity": "",
|
||||
"hp": "",
|
||||
"types": [""],
|
||||
"evolveFrom": "",
|
||||
"description": "",
|
||||
"stage": "",
|
||||
"attacks": [
|
||||
{
|
||||
"cost": [""],
|
||||
"name": "",
|
||||
"effect": ""
|
||||
},
|
||||
{
|
||||
"cost": [""],
|
||||
"name": "",
|
||||
"effect": "",
|
||||
"damage": 0
|
||||
}
|
||||
],
|
||||
"weaknesses": [
|
||||
{
|
||||
"type": "",
|
||||
"value": ""
|
||||
}
|
||||
],
|
||||
"retreat": 0,
|
||||
"regulationMark": "",
|
||||
"legal": {
|
||||
"standard": true,
|
||||
"expanded": true
|
||||
}
|
||||
}
|
||||
248
clean-text-to-keywords/keyword_extractor.py
Normal file
248
clean-text-to-keywords/keyword_extractor.py
Normal file
@@ -0,0 +1,248 @@
|
||||
"""Rule-based keyword extraction and normalization for Pokemon card generation."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Dict, Iterable, List, Mapping, Optional, Sequence, Set, Tuple
|
||||
|
||||
# canonical keyword -> synonyms that collapse onto it during normalization.
DEFAULT_NORMALIZATION_MAP: Dict[str, List[str]] = {
    "normal": ["basic", "common", "regular", "plain", "normaltype"],
    "fire": ["flame", "flames", "burn", "burning", "blaze", "fiery", "heat", "inferno"],
    "water": ["wave", "ocean", "sea", "river", "aqua", "splash", "tidal"],
    "grass": ["plant", "leaf", "forest", "nature", "vine", "seed", "flora"],
    "flying": ["air", "wind", "sky", "wing", "wings", "flight", "soar"],
    "fighting": ["punch", "kick", "strike", "martial", "combat", "brawl"],
    "poison": ["toxic", "venom", "acid", "poisonous", "toxin"],
    "electric": ["lightning", "thunder", "shock", "volt", "spark", "electricity"],
    "ground": ["earth", "soil", "sand", "mud", "quake", "dust"],
    "rock": ["stone", "boulder", "crystal", "rocky", "pebble"],
    "psychic": ["mind", "mental", "telepathy", "psyonic", "brain", "illusion"],
    "ice": ["freeze", "frozen", "snow", "frost", "blizzard", "icy"],
    "bug": ["insect", "ant", "beetle", "spider", "crawler"],
    "ghost": ["spirit", "phantom", "haunt", "shadow", "specter"],
    "steel": ["metal", "iron", "armor", "blade", "alloy"],
    "dragon": ["drake", "wyrm", "serpent", "legendary"],
    "dark": ["shadow", "evil", "night", "doom", "darkness"],
    "fairy": ["magic", "magical", "sparkle", "light", "charm"],
    "explosion": ["explosive", "explode", "blast"],
}

# POS tags that survive token filtering.
DEFAULT_ALLOWED_POS: Tuple[str, ...] = ("NOUN", "ADJ", "VERB")
# Known upstream-cleaning artifacts that are dropped outright.
DEFAULT_IGNORED_KEYWORDS: Set[str] = {"preevolution", "pokmon"}
# Relative importance of each POS when ranking candidates.
DEFAULT_POS_WEIGHTS: Dict[str, float] = {
    "NOUN": 3.0,
    "ADJ": 2.0,
    "VERB": 1.0,
}
# Fraction of candidates kept after YAKE ranking, clamped between
# DEFAULT_MIN_KEYWORDS and DEFAULT_MAX_KEYWORDS.
DEFAULT_KEEP_RATIO = 0.8
DEFAULT_MIN_KEYWORDS = 12
DEFAULT_MAX_KEYWORDS = 30
|
||||
|
||||
|
||||
def _invert_normalization_map(normalization_map: Mapping[str, Iterable[str]]) -> Dict[str, str]:
|
||||
"""Build synonym -> canonical mapping for O(1) normalization lookup."""
|
||||
inverse: Dict[str, str] = {}
|
||||
for canonical, synonyms in normalization_map.items():
|
||||
canonical_normalized = canonical.strip().lower()
|
||||
inverse[canonical_normalized] = canonical_normalized
|
||||
for synonym in synonyms:
|
||||
synonym_normalized = synonym.strip().lower()
|
||||
if synonym_normalized:
|
||||
inverse[synonym_normalized] = canonical_normalized
|
||||
return inverse
|
||||
|
||||
|
||||
def _tokenize_keyword_phrase(value: str) -> List[str]:
|
||||
return re.findall(r"[a-z0-9]+", value.lower())
|
||||
|
||||
|
||||
@dataclass
class KeywordExtractor:
    """Deterministic spaCy + YAKE + rule-based normalization pipeline."""

    # Loaded spaCy pipeline — any callable returning tokens exposing
    # .pos_, .lemma_, .is_stop and .is_punct works (see tests' FakeNLP).
    nlp: Any
    # canonical -> synonyms map used to collapse domain synonyms.
    normalization_map: Mapping[str, Iterable[str]] = field(default_factory=lambda: DEFAULT_NORMALIZATION_MAP)
    # POS tags that survive filtering.
    allowed_pos: Sequence[str] = field(default_factory=lambda: DEFAULT_ALLOWED_POS)
    # Keywords dropped outright (noise from upstream cleaning).
    ignored_keywords: Set[str] = field(default_factory=lambda: set(DEFAULT_IGNORED_KEYWORDS))
    # Relative importance of each POS when ranking candidates.
    pos_weights: Mapping[str, float] = field(default_factory=lambda: DEFAULT_POS_WEIGHTS)
    # Fraction of candidates kept after YAKE ranking.
    keep_ratio: float = DEFAULT_KEEP_RATIO
    min_keywords: int = DEFAULT_MIN_KEYWORDS
    max_keywords: int = DEFAULT_MAX_KEYWORDS
    # Set False to skip YAKE scoring and keep all normalized candidates.
    use_yake: bool = True

    def __post_init__(self) -> None:
        # Precompute lookup structures once so extract() stays fast.
        self._normalization_lookup = _invert_normalization_map(self.normalization_map)
        self._allowed_pos_set = set(self.allowed_pos)
        self._ignored_keywords = {keyword.lower().strip() for keyword in self.ignored_keywords}
        self._pos_weight_lookup = {k.upper(): float(v) for k, v in self.pos_weights.items()}

    @classmethod
    def from_default_model(
        cls,
        model_name: str = "en_core_web_sm",
        normalization_map: Optional[Mapping[str, Iterable[str]]] = None,
        allowed_pos: Sequence[str] = DEFAULT_ALLOWED_POS,
        ignored_keywords: Optional[Set[str]] = None,
        pos_weights: Mapping[str, float] = DEFAULT_POS_WEIGHTS,
        keep_ratio: float = DEFAULT_KEEP_RATIO,
        min_keywords: int = DEFAULT_MIN_KEYWORDS,
        max_keywords: int = DEFAULT_MAX_KEYWORDS,
        use_yake: bool = True,
    ) -> "KeywordExtractor":
        """Initialize extractor with a spaCy English pipeline.

        Raises:
            OSError: if the requested spaCy model is not installed.
            RuntimeError: if spaCy itself fails to load in this environment.
        """
        try:
            import spacy

            nlp = spacy.load(model_name)
        except OSError as exc:
            raise OSError(
                f"spaCy model '{model_name}' is not installed. "
                "Run: python -m spacy download en_core_web_sm"
            ) from exc
        except Exception as exc:
            raise RuntimeError(
                "spaCy could not be loaded in this Python environment. "
                "Try Python 3.13 or lower, then install spaCy and en_core_web_sm."
            ) from exc

        return cls(
            nlp=nlp,
            normalization_map=normalization_map or DEFAULT_NORMALIZATION_MAP,
            allowed_pos=allowed_pos,
            ignored_keywords=ignored_keywords or set(DEFAULT_IGNORED_KEYWORDS),
            pos_weights=pos_weights,
            keep_ratio=keep_ratio,
            min_keywords=min_keywords,
            max_keywords=max_keywords,
            use_yake=use_yake,
        )

    def extract(self, text: str) -> List[str]:
        """Extract, normalize and rank keywords from already-cleaned text."""
        if not text or not text.strip():
            return []

        doc = self.nlp(text)

        # Step 1: POS filtering + lowercase lemma/token extraction.
        raw_keywords: List[Tuple[str, str]] = []
        for token in doc:
            if token.is_stop or token.is_punct or token.pos_ not in self._allowed_pos_set:
                continue

            # "-PRON-" is the legacy spaCy pronoun lemma placeholder; fall
            # back to the surface form in that case.
            base = token.lemma_.lower().strip() if token.lemma_ and token.lemma_ != "-PRON-" else token.text.lower().strip()
            if base and base not in self._ignored_keywords:
                raw_keywords.append((base, token.pos_))

        # Step 2: Deduplicate before domain normalization.
        deduplicated: List[Tuple[str, str]] = []
        seen_raw: Set[str] = set()
        for keyword, pos in raw_keywords:
            if keyword in seen_raw:
                continue
            seen_raw.add(keyword)
            deduplicated.append((keyword, pos))

        # Step 3: Normalize and deduplicate canonical forms, remembering
        # each entry's first position for later order restoration.
        unique_entries: List[Tuple[str, str, str, int]] = []
        seen_normalized: Set[str] = set()
        for index, (original_keyword, pos) in enumerate(deduplicated):
            normalized_keyword = self._normalize_keyword(original_keyword)
            if normalized_keyword in seen_normalized:
                continue
            seen_normalized.add(normalized_keyword)
            unique_entries.append((original_keyword, normalized_keyword, pos, index))

        if not unique_entries:
            return []

        if not self.use_yake:
            return [normalized_keyword for _, normalized_keyword, _, _ in unique_entries]

        # Step 4: YAKE scoring + conservative selection to preserve detail.
        yake_scores = self._extract_yake_scores(text)
        if not yake_scores:
            return [normalized_keyword for _, normalized_keyword, _, _ in unique_entries]

        ranked: List[Tuple[float, int, str]] = []
        for original_keyword, normalized_keyword, pos, index in unique_entries:
            score_candidates: List[float] = []
            if original_keyword in yake_scores:
                score_candidates.append(yake_scores[original_keyword])
            if normalized_keyword in yake_scores:
                score_candidates.append(yake_scores[normalized_keyword])

            # Missing score is treated as moderately relevant to avoid over-pruning.
            yake_penalty = min(score_candidates) if score_candidates else 0.45
            pos_weight = self._pos_weight_lookup.get(pos.upper(), 1.0)
            # Lower YAKE score means more important, so invert before weighting.
            combined_score = (1.0 - yake_penalty) * pos_weight
            ranked.append((combined_score, index, normalized_keyword))

        target_count = self._compute_target_count(len(ranked))
        # Rank by descending score, breaking ties by original position.
        ranked.sort(key=lambda item: (-item[0], item[1]))
        selected = ranked[:target_count]
        # Restore original text order for the final output.
        selected.sort(key=lambda item: item[1])

        return [keyword for _, _, keyword in selected]

    def _compute_target_count(self, total_keywords: int) -> int:
        """How many keywords to keep: keep_ratio of the total, clamped to
        [min_keywords, max_keywords] and never more than available."""
        if total_keywords <= 0:
            return 0

        target = max(self.min_keywords, math.ceil(total_keywords * self.keep_ratio))
        if self.max_keywords > 0:
            target = min(target, self.max_keywords)
        return min(target, total_keywords)

    def _extract_yake_scores(self, text: str) -> Dict[str, float]:
        """Score tokens with YAKE; returns {} when YAKE is unavailable or fails."""
        try:
            import yake
        except Exception:
            return {}

        # Scale how many phrases are requested with the input length.
        text_token_count = len(text.split())
        top_n = max(20, min(80, text_token_count * 2))

        try:
            extractor = yake.KeywordExtractor(lan="en", n=2, dedupLim=0.9, top=top_n)
            phrase_scores = extractor.extract_keywords(text)
        except Exception:
            return {}

        # Break phrases into tokens, keeping each token's best (lowest) score.
        token_scores: Dict[str, float] = {}
        for phrase, score in phrase_scores:
            for token in _tokenize_keyword_phrase(phrase):
                existing = token_scores.get(token)
                if existing is None or score < existing:
                    token_scores[token] = score

        if not token_scores:
            return {}

        values = list(token_scores.values())
        min_score = min(values)
        max_score = max(values)

        # Uniform scores carry no ranking signal; treat all as neutral.
        if math.isclose(min_score, max_score):
            return {token: 0.5 for token in token_scores}

        # Normalize so 0.0=most important and 1.0=least important.
        return {
            token: (score - min_score) / (max_score - min_score)
            for token, score in token_scores.items()
        }

    def _normalize_keyword(self, keyword: str) -> str:
        """Collapse a keyword to its canonical domain form, when mapped."""
        keyword_lower = keyword.lower()
        return self._normalization_lookup.get(keyword_lower, keyword_lower)
|
||||
|
||||
|
||||
def extract_keywords(
    text: str,
    extractor: Optional[KeywordExtractor] = None,
) -> List[str]:
    """Convenience API to extract keywords with default extractor config."""
    if extractor is None:
        extractor = KeywordExtractor.from_default_model()
    return extractor.extract(text)
|
||||
2
clean-text-to-keywords/requirements.txt
Normal file
2
clean-text-to-keywords/requirements.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
spacy>=3.7.0
|
||||
yake>=0.4.2
|
||||
143
clean-text-to-keywords/test_json_inference.py
Normal file
143
clean-text-to-keywords/test_json_inference.py
Normal file
@@ -0,0 +1,143 @@
|
||||
import unittest
|
||||
|
||||
from json_inference import fill_template_from_keywords, infer_profile_from_keywords
|
||||
|
||||
|
||||
class JsonInferenceTests(unittest.TestCase):
|
||||
def test_profile_inference_basics(self) -> None:
|
||||
keywords = [
|
||||
"zapthorn",
|
||||
"electric",
|
||||
"wolf",
|
||||
"thunder",
|
||||
"claw",
|
||||
"speed",
|
||||
"storm",
|
||||
"agile",
|
||||
"forest",
|
||||
"recover",
|
||||
"energy",
|
||||
]
|
||||
|
||||
profile = infer_profile_from_keywords(keywords)
|
||||
|
||||
self.assertEqual(profile["name"], "Zapthorn")
|
||||
self.assertEqual(profile["type"], "electric")
|
||||
self.assertIn("thunder", profile["attacks"])
|
||||
self.assertIn("claw", profile["attacks"])
|
||||
self.assertIn("recover", profile["abilities"])
|
||||
self.assertEqual(profile["habitat"], "forest")
|
||||
self.assertGreaterEqual(profile["speed"], 80)
|
||||
|
||||
def test_fill_key_only_template(self) -> None:
|
||||
template = {
|
||||
"name": "",
|
||||
"type": "",
|
||||
"secondary_type": None,
|
||||
"attacks": [],
|
||||
"abilities": [],
|
||||
"habitat": "",
|
||||
"personality": [],
|
||||
"hp": None,
|
||||
"attack": None,
|
||||
"defense": None,
|
||||
"speed": None,
|
||||
"description": "",
|
||||
"keywords": [],
|
||||
}
|
||||
|
||||
keywords = [
|
||||
"furret",
|
||||
"normal",
|
||||
"tail",
|
||||
"smash",
|
||||
"tunnel",
|
||||
"agile",
|
||||
"cheerful",
|
||||
"explore",
|
||||
"endurance",
|
||||
]
|
||||
|
||||
result = fill_template_from_keywords(template, keywords)
|
||||
|
||||
self.assertEqual(result["name"], "Furret")
|
||||
self.assertEqual(result["type"], "normal")
|
||||
self.assertIn("smash", result["attacks"])
|
||||
self.assertIn("explore", result["abilities"])
|
||||
self.assertEqual(result["habitat"], "tunnel")
|
||||
self.assertIn("cheerful", result["personality"])
|
||||
self.assertIsInstance(result["description"], str)
|
||||
self.assertGreater(len(result["description"]), 20)
|
||||
|
||||
def test_fill_tcg_style_template(self) -> None:
|
||||
template = {
|
||||
"category": "Pokemon",
|
||||
"name": "",
|
||||
"hp": "",
|
||||
"types": [""],
|
||||
"description": "",
|
||||
"stage": "",
|
||||
"attacks": [
|
||||
{"cost": [""], "name": "", "effect": ""},
|
||||
{"cost": [""], "name": "", "effect": "", "damage": 0},
|
||||
],
|
||||
"weaknesses": [{"type": "", "value": ""}],
|
||||
"retreat": 0,
|
||||
}
|
||||
|
||||
keywords = [
|
||||
"zapthorn",
|
||||
"electric",
|
||||
"thunder",
|
||||
"claw",
|
||||
"speed",
|
||||
"storm",
|
||||
"energy",
|
||||
]
|
||||
|
||||
result = fill_template_from_keywords(template, keywords)
|
||||
|
||||
self.assertEqual(result["name"], "Zapthorn")
|
||||
self.assertEqual(result["types"], ["electric"])
|
||||
self.assertEqual(result["stage"], "Basic")
|
||||
self.assertTrue(result["hp"].isdigit())
|
||||
self.assertEqual(result["weaknesses"][0]["type"], "ground")
|
||||
self.assertEqual(result["weaknesses"][0]["value"], "x2")
|
||||
self.assertEqual(result["attacks"][0]["name"], "Thunder")
|
||||
self.assertEqual(result["attacks"][1]["name"], "Claw")
|
||||
self.assertEqual(result["attacks"][0]["cost"], ["Electric"])
|
||||
self.assertGreaterEqual(result["retreat"], 0)
|
||||
|
||||
def test_name_fallback_to_unknown_for_generic_tokens(self) -> None:
    """Generic descriptive tokens alone must not be promoted to a name."""
    generic_tokens = "black fur giant electric claw speed storm".split()

    profile = infer_profile_from_keywords(generic_tokens)

    self.assertEqual(profile["name"], "Unknown")
|
||||
|
||||
def test_preserves_existing_values(self) -> None:
    """Pre-filled template fields are kept; only empty slots are populated."""
    template = {
        "name": "CustomName",
        "type": "electric",
        "attacks": [],
        "description": "Already set",
    }

    result = fill_template_from_keywords(template, ["furret", "normal", "attack"])

    for field, expected in (
        ("name", "CustomName"),
        ("type", "electric"),
        ("description", "Already set"),
    ):
        self.assertEqual(result[field], expected)
    self.assertIn("attack", result["attacks"])
|
||||
|
||||
|
||||
# Allow running this test module directly (python <file>.py) as well as via pytest.
if __name__ == "__main__":
    unittest.main()
|
||||
166
clean-text-to-keywords/test_keyword_extractor.py
Normal file
166
clean-text-to-keywords/test_keyword_extractor.py
Normal file
@@ -0,0 +1,166 @@
|
||||
import unittest
|
||||
|
||||
from keyword_extractor import KeywordExtractor
|
||||
|
||||
|
||||
class FakeToken:
    """Minimal stand-in for a spaCy token, carrying only the fields the extractor reads."""

    def __init__(self, text: str, pos: str, lemma: str, is_stop: bool) -> None:
        self.text = text
        self.pos_ = pos
        self.lemma_ = lemma
        self.is_stop = is_stop
        # A token counts as punctuation when it has no alphanumeric character at all.
        self.is_punct = not any(map(str.isalnum, text))
|
||||
|
||||
|
||||
class FakeNLP:
    """Callable mimicking a spaCy pipeline: whitespace tokenisation plus table-driven tagging."""

    def __init__(self, tag_map, stopwords) -> None:
        self.tag_map = tag_map
        self.stopwords = stopwords

    def __call__(self, text: str):
        def build(raw: str) -> FakeToken:
            word = raw.strip()
            lowered = word.lower()
            return FakeToken(
                text=word,
                # Unknown words default to NOUN.
                pos=self.tag_map.get(lowered, "NOUN"),
                lemma=lowered,
                is_stop=lowered in self.stopwords,
            )

        return [build(raw) for raw in text.split()]
|
||||
|
||||
|
||||
class TestableKeywordExtractor(KeywordExtractor):
    """KeywordExtractor whose YAKE scoring is replaced by a canned score table."""

    def __init__(self, *args, yake_scores=None, **kwargs):
        super().__init__(*args, **kwargs)
        # None and an empty mapping both collapse to a fresh dict,
        # matching the `yake_scores or {}` idiom.
        self._test_yake_scores = yake_scores if yake_scores else {}

    def _extract_yake_scores(self, text: str):
        # Ignore the input text entirely; return the fixed scores.
        return self._test_yake_scores
|
||||
|
||||
|
||||
class KeywordExtractorTests(unittest.TestCase):
    """Behavioural tests for KeywordExtractor driven by the FakeNLP pipeline."""

    @classmethod
    def setUpClass(cls) -> None:
        # POS tags for every word used in the tests below; FakeNLP defaults
        # unknown words to NOUN.
        tag_map = {
            "fiery": "ADJ",
            "dragon": "NOUN",
            "attack": "VERB",
            "explosive": "ADJ",
            "flames": "NOUN",
            "burning": "ADJ",
            "creature": "NOUN",
            "with": "ADP",
            "blaze": "NOUN",
            "and": "CCONJ",
            "dangerous": "ADJ",
            "electric": "ADJ",
            "mouse": "NOUN",
            "using": "VERB",
            "thunder": "NOUN",
            "shock": "NOUN",
            "strong": "ADJ",
            "furret": "NOUN",
            "long": "ADJ",
            "slender": "ADJ",
            "soft": "ADJ",
            "fur": "NOUN",
            "flexible": "ADJ",
            "body": "NOUN",
            "move": "VERB",
            "gracefully": "ADJ",
            "narrow": "ADJ",
            "tunnel": "NOUN",
            "tail": "NOUN",
            "smash": "VERB",
            "opponent": "NOUN",
            "battle": "NOUN",
            "cheerful": "ADJ",
            "endurance": "NOUN",
        }

        # Words FakeNLP should flag as stopwords.
        stopwords = {
            "a",
            "very",
            "and",
            "with",
            "the",
            "it",
            "to",
            "its",
            "that",
            "through",
            "in",
        }
        cls.nlp = FakeNLP(tag_map=tag_map, stopwords=stopwords)
        # Default extractor under test: deterministic path with YAKE disabled.
        cls.extractor = KeywordExtractor(nlp=cls.nlp, use_yake=False)

    def test_readme_main_example(self) -> None:
        # Mirrors the README example: "fiery" -> "fire", "explosive" -> "explosion".
        text = "fiery dragon attack explosive flames"
        result = self.extractor.extract(text)
        self.assertEqual(result, ["fire", "dragon", "attack", "explosion"])

    def test_synonym_normalization(self) -> None:
        # "burning" and "blaze" both normalise to the canonical keyword "fire".
        text = "burning creature with blaze power"
        result = self.extractor.extract(text)
        self.assertEqual(result, ["fire", "creature", "power"])

    def test_mixed_types(self) -> None:
        # Adjective + noun + verb input survives extraction in order.
        text = "electric mouse using thunder shock"
        result = self.extractor.extract(text)
        self.assertEqual(result, ["electric", "mouse", "using"])

    def test_noise_input(self) -> None:
        # Stopwords ("a", "very", "and") and duplicates are dropped.
        text = "a very very strong and dangerous creature"
        result = self.extractor.extract(text)
        self.assertEqual(result, ["strong", "dangerous", "creature"])

    def test_yake_keeps_detailed_information(self) -> None:
        # With YAKE enabled, a detailed description keeps at least
        # min_keywords terms; scores here follow the YAKE convention where
        # lower is presumably more relevant — confirm against KeywordExtractor.
        text = (
            "furret long slender creature soft fur flexible body move gracefully narrow tunnel "
            "tail smash opponent battle cheerful endurance"
        )

        yake_scores = {
            "furret": 0.00,
            "creature": 0.05,
            "tail": 0.08,
            "battle": 0.10,
            "smash": 0.12,
            "tunnel": 0.14,
            "endurance": 0.18,
            "body": 0.20,
            "cheerful": 0.22,
            "slender": 0.26,
            "flexible": 0.28,
            "gracefully": 0.34,
            "narrow": 0.40,
            "long": 0.42,
            "soft": 0.44,
            "fur": 0.45,
            "move": 0.48,
            "opponent": 0.52,
        }
        extractor = TestableKeywordExtractor(
            nlp=self.nlp,
            use_yake=True,
            keep_ratio=0.8,
            min_keywords=10,
            max_keywords=30,
            yake_scores=yake_scores,
        )

        result = extractor.extract(text)

        self.assertGreaterEqual(len(result), 10)
        self.assertIn("furret", result)
        self.assertIn("creature", result)
        self.assertIn("tail", result)
        self.assertIn("tunnel", result)
|
||||
|
||||
|
||||
# Allow running this test module directly (python <file>.py) as well as via pytest.
if __name__ == "__main__":
    unittest.main()
|
||||
146
fetch_card.py
Normal file
146
fetch_card.py
Normal file
@@ -0,0 +1,146 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Download Pokémon TCG card images with embedded JSON metadata.
|
||||
|
||||
Uses the TCGdex SDK to:
|
||||
1. List all sets (with configurable limit)
|
||||
2. For each set, list all cards (with configurable limit)
|
||||
3. Download each card image (PNG) and embed full card data as PNG metadata
|
||||
"""
|
||||
|
||||
import json
|
||||
import io
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from dataclasses import asdict, is_dataclass
|
||||
from pathlib import Path
|
||||
|
||||
from PIL import Image, PngImagePlugin
|
||||
from tcgdexsdk import TCGdex, Language
|
||||
from tcgdexsdk.enums import Quality, Extension
|
||||
|
||||
# ── Configuration ──────────────────────────────────────────────
MAX_SETS = 10000           # Number of sets to process (None = all)
MAX_CARDS_PER_SET = 10000  # Number of cards per set (None = all)
OUTPUT_DIR = Path(__file__).resolve().parent / "cards"  # Destination root for downloads
IMAGE_QUALITY = Quality.HIGH  # TCGdex image quality to request
MAX_WORKERS = 8            # Parallel download threads
# ───────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def card_to_dict(card) -> dict:
    """Snapshot a card object's public data attributes as a JSON-ready dict.

    SDK internals (the client handle and image helpers) and private attributes
    are skipped; remaining values are normalised via _serialise().
    """
    skipped = {"sdk", "get_image", "get_image_url"}
    snapshot = {}
    for name in dir(card):
        if name.startswith("_") or name in skipped:
            continue
        value = getattr(card, name, None)
        if not callable(value):
            snapshot[name] = _serialise(value)
    return snapshot
|
||||
|
||||
|
||||
def _serialise(obj):
|
||||
"""Recursively convert dataclass / nested objects to plain dicts."""
|
||||
if obj is None or isinstance(obj, (str, int, float, bool)):
|
||||
return obj
|
||||
if is_dataclass(obj) and not isinstance(obj, type):
|
||||
return {
|
||||
k: _serialise(v)
|
||||
for k, v in asdict(obj).items()
|
||||
if k != "sdk"
|
||||
}
|
||||
if isinstance(obj, list):
|
||||
return [_serialise(i) for i in obj]
|
||||
if isinstance(obj, dict):
|
||||
return {k: _serialise(v) for k, v in obj.items()}
|
||||
# Fallback: try dataclass-style attribute extraction
|
||||
if hasattr(obj, "__dict__"):
|
||||
return {
|
||||
k: _serialise(v)
|
||||
for k, v in obj.__dict__.items()
|
||||
if k != "sdk"
|
||||
}
|
||||
return str(obj)
|
||||
|
||||
|
||||
def save_image_with_metadata(image_bytes: bytes, metadata: dict, path: Path):
    """Write *image_bytes* out as a PNG carrying *metadata* in a tEXt chunk."""
    info = PngImagePlugin.PngInfo()
    info.add_text("pokemon_metadata", json.dumps(metadata, ensure_ascii=False))

    path.parent.mkdir(parents=True, exist_ok=True)
    image = Image.open(io.BytesIO(image_bytes))
    image.save(str(path), "PNG", pnginfo=info)
|
||||
|
||||
|
||||
def process_card(card_id: str, set_dir: Path) -> str | None:
    """Fetch one card's data and image, then save both; return a label on success.

    A fresh TCGdex client is built per call (callers run this from worker
    threads). Returns None when the card cannot be fetched.
    """
    client = TCGdex(Language.EN)
    card = client.card.getSync(card_id)
    if not card:
        return None

    metadata = card_to_dict(card)
    response = card.get_image(IMAGE_QUALITY, Extension.PNG)
    image_bytes = response.read()

    save_image_with_metadata(image_bytes, metadata, set_dir / f"{card.localId}.png")
    return f"{card.name} ({card.id})"
|
||||
|
||||
|
||||
def main():
    """Download card images + embedded metadata for every configured set."""
    sdk = TCGdex(Language.EN)

    # 1. Get sets
    all_sets = sdk.set.listSync()
    if not all_sets:
        print("No sets returned.")
        return

    # Honour the MAX_SETS cap (None/0 means "all").
    sets_to_process = all_sets[:MAX_SETS] if MAX_SETS else all_sets
    print(f"Processing {len(sets_to_process)} / {len(all_sets)} sets\n")

    total_downloaded = 0

    for si, set_resume in enumerate(sets_to_process, 1):
        # The listing endpoint returns summaries; fetch the full set for its card list.
        full_set = sdk.set.getSync(set_resume.id)
        if not full_set or not full_set.cards:
            print(f"[{si}] {set_resume.name}: no cards, skipping")
            continue

        cards = full_set.cards[:MAX_CARDS_PER_SET] if MAX_CARDS_PER_SET else full_set.cards
        card_total = full_set.cardCount.total if full_set.cardCount else len(full_set.cards)
        print(f"[{si}/{len(sets_to_process)}] {set_resume.name} — {len(cards)}/{card_total} cards")

        set_dir = OUTPUT_DIR / set_resume.id

        # Download this set's cards in parallel; process_card builds its own SDK client.
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
            futures = {
                pool.submit(process_card, cr.id, set_dir): cr.id
                for cr in cards
            }
            for future in as_completed(futures):
                card_id = futures[future]
                try:
                    result = future.result()
                    if result:
                        total_downloaded += 1
                        print(f" {result} ✓")
                    else:
                        print(f" {card_id}: skipped")
                except Exception as e:
                    # Keep going: one failed card must not abort the whole set.
                    print(f" {card_id}: failed ({e})")

        print()

    print(f"Done — {total_downloaded} cards saved to {OUTPUT_DIR}")


if __name__ == "__main__":
    main()
|
||||
|
||||
BIN
pokeball.png
Normal file
BIN
pokeball.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 3.6 KiB |
742
pokemon_card_training_2.ipynb
Normal file
742
pokemon_card_training_2.ipynb
Normal file
File diff suppressed because one or more lines are too long
346
prompt_to_card_pipeline.py
Normal file
346
prompt_to_card_pipeline.py
Normal file
@@ -0,0 +1,346 @@
|
||||
"""End-to-end prompt -> cleaned text -> inferred JSON -> generated card image.
|
||||
|
||||
This script is built to connect the three stages described by the user:
|
||||
1) call get_clean_text(user_text) from a text-cleaning module file
|
||||
2) pass cleaned text into infer_json_usage.py with --json-only --template
|
||||
3) load a checkpoint and generate a card image from inferred metadata
|
||||
|
||||
The model-loading part is intentionally pluggable because checkpoint structures vary.
|
||||
If your .pt checkpoint cannot be used directly as a callable pipeline, provide a
|
||||
generator module implementing:
|
||||
|
||||
def build_pipeline(checkpoint_path: str, device: str): ...
|
||||
def metadata_to_conditioning(meta: dict) -> str: ... # optional
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import importlib
|
||||
import importlib.util
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Mapping
|
||||
|
||||
|
||||
def _load_module_from_file(module_file: str):
|
||||
module_path = Path(module_file).resolve()
|
||||
if not module_path.exists():
|
||||
raise FileNotFoundError(f"Module file not found: {module_path}")
|
||||
|
||||
spec = importlib.util.spec_from_file_location(module_path.stem, str(module_path))
|
||||
if spec is None or spec.loader is None:
|
||||
raise ImportError(f"Cannot import module from file: {module_path}")
|
||||
|
||||
module = importlib.util.module_from_spec(spec)
|
||||
spec.loader.exec_module(module)
|
||||
print("module successfully charged")
|
||||
return module
|
||||
|
||||
|
||||
def _load_function_from_file(module_file: str, function_name: str) -> Callable[..., Any]:
    """Load a named callable from the module at *module_file*.

    Args:
        module_file: Path to a ``.py`` file.
        function_name: Attribute to fetch from the loaded module.

    Raises:
        AttributeError: If the module has no attribute of that name.
        TypeError: If the attribute exists but is not callable.
    """
    module = _load_module_from_file(module_file)
    if not hasattr(module, function_name):
        raise AttributeError(f"{module_file} has no function named '{function_name}'")
    func = getattr(module, function_name)
    if not callable(func):
        raise TypeError(f"{function_name} in {module_file} is not callable")
    return func
|
||||
|
||||
|
||||
def _extract_json_from_output(raw: str) -> Mapping[str, Any]:
|
||||
print("_extract_json_from_output")
|
||||
stripped = raw.strip()
|
||||
if not stripped:
|
||||
raise ValueError("Inference command returned empty output")
|
||||
|
||||
try:
|
||||
parsed = json.loads(stripped)
|
||||
if not isinstance(parsed, dict):
|
||||
raise ValueError("Inference output is JSON but not an object")
|
||||
return parsed
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
# Fallback: parse the last JSON object in mixed stdout.
|
||||
last_open = stripped.rfind("{")
|
||||
last_close = stripped.rfind("}")
|
||||
if last_open == -1 or last_close == -1 or last_close <= last_open:
|
||||
raise ValueError(f"Could not parse JSON from inference output:\n{raw}")
|
||||
|
||||
candidate = stripped[last_open : last_close + 1]
|
||||
parsed = json.loads(candidate)
|
||||
print("json parsed with success")
|
||||
if not isinstance(parsed, dict):
|
||||
raise ValueError("Parsed fallback JSON is not an object")
|
||||
return parsed
|
||||
|
||||
|
||||
def run_infer_json_cli(
    infer_script_path: str,
    template_path: str,
    cleaned_text: str,
    python_executable: str | None = None,
) -> Mapping[str, Any]:
    """Invoke infer_json_usage.py in a subprocess and parse its JSON output.

    Args:
        infer_script_path: Path to infer_json_usage.py.
        template_path: Path to the JSON card template file.
        cleaned_text: Pre-cleaned user text to infer from.
        python_executable: Interpreter used to run the script; defaults to the
            current interpreter.

    Returns:
        The inferred card metadata as a dict.

    Raises:
        FileNotFoundError: If the script or template file is missing.
        RuntimeError: If the subprocess exits with a non-zero status.
        ValueError: If no JSON object can be parsed from its stdout.
    """
    infer_script = Path(infer_script_path).resolve()
    if not infer_script.exists():
        raise FileNotFoundError(f"infer_json_usage.py not found: {infer_script}")

    template_file = Path(template_path).resolve()
    if not template_file.exists():
        raise FileNotFoundError(f"Template file not found: {template_file}")

    cmd = [
        python_executable or sys.executable,
        str(infer_script),
        "--json-only",
        "--template",
        str(template_file),
        cleaned_text,
    ]
    # check=False: we translate failures into a RuntimeError with stderr attached.
    result = subprocess.run(cmd, capture_output=True, text=True, check=False)

    if result.returncode != 0:
        stderr = result.stderr.strip()
        raise RuntimeError(
            "JSON inference command failed. "
            f"exit={result.returncode}, stderr={stderr or '<empty>'}"
        )
    return _extract_json_from_output(result.stdout)
|
||||
|
||||
|
||||
def default_metadata_to_conditioning(meta: Mapping[str, Any]) -> str:
    """Convert inferred card metadata into a text prompt for image generation.

    Builds a "; "-joined prompt of name, type, hp, and optionally the first two
    attack names and the description. Missing fields fall back to defaults
    ("Unknown Pokemon", "normal", "60").
    """
    name = str(meta.get("name", "Unknown Pokemon"))
    types = meta.get("types") or []
    if isinstance(types, list):
        # Empty/blank lists fall back to the singular "type" key.
        type_text = ", ".join(str(item) for item in types if item) or str(meta.get("type", "normal"))
    else:
        type_text = str(meta.get("type", "normal"))

    attacks = meta.get("attacks") or []
    attack_names = []
    if isinstance(attacks, list):
        for attack in attacks:
            if isinstance(attack, dict):
                value = attack.get("name")
                if value:
                    attack_names.append(str(value))
            elif attack:
                attack_names.append(str(attack))

    hp = str(meta.get("hp", "60"))
    description = str(meta.get("description", ""))

    parts = [
        f"Pokemon trading card illustration of {name}",
        f"type: {type_text}",
        f"hp: {hp}",
    ]
    if attack_names:
        # Cap at two attacks to keep the prompt short.
        parts.append(f"attacks: {', '.join(attack_names[:2])}")
    if description:
        parts.append(f"description: {description}")
    return "; ".join(parts)
|
||||
|
||||
|
||||
@dataclass
class CheckpointCardGenerator:
    """Wraps a callable image-generation pipeline loaded from a checkpoint.

    When ``generator_module_path`` is set, that module's ``build_pipeline()``
    (and optional ``metadata_to_conditioning()``) are used; otherwise a
    best-effort direct torch checkpoint load is attempted.
    """

    checkpoint_path: str
    device: str = "cpu"
    generator_module_path: str = ""

    def __post_init__(self) -> None:
        # Load the optional generator module exactly once and reuse it for
        # both the pipeline builder and the conditioning function (previously
        # it was imported and executed twice).
        self._generator_module = (
            _load_module_from_file(self.generator_module_path)
            if self.generator_module_path
            else None
        )
        self._pipe = self._build_pipe()
        self._metadata_to_conditioning = self._build_conditioning_function()

    def _build_pipe(self):
        """Return a callable diffusers-style pipeline.

        Raises:
            AttributeError: If the custom module lacks build_pipeline().
            TypeError: If build_pipeline exists but is not callable.
            RuntimeError: If torch is missing or the checkpoint yields nothing callable.
        """
        if self._generator_module is not None:
            if not hasattr(self._generator_module, "build_pipeline"):
                raise AttributeError(
                    "Custom generator module must define build_pipeline(checkpoint_path, device)."
                )
            build_pipeline = getattr(self._generator_module, "build_pipeline")
            if not callable(build_pipeline):
                raise TypeError("build_pipeline exists but is not callable")
            return build_pipeline(self.checkpoint_path, self.device)

        # Best-effort direct checkpoint loading for simple callable pipeline dumps.
        try:
            torch = importlib.import_module("torch")
        except ModuleNotFoundError as exc:
            raise RuntimeError(
                "torch is required to load checkpoint files. Install torch or provide --generator-module."
            ) from exc
        checkpoint = torch.load(self.checkpoint_path, map_location=self.device)

        if callable(checkpoint):
            return checkpoint

        if isinstance(checkpoint, dict):
            # Common dump layouts store the pipeline under one of these keys.
            for key in ("pipe", "pipeline", "model"):
                candidate = checkpoint.get(key)
                if callable(candidate):
                    return candidate

        raise RuntimeError(
            "Could not construct a callable generation pipeline from checkpoint. "
            "Pass --generator-module with a build_pipeline() function for your model layout."
        )

    def _build_conditioning_function(self) -> Callable[[Mapping[str, Any]], str]:
        """Return the module's metadata_to_conditioning() if provided, else the default."""
        if self._generator_module is not None:
            func = getattr(self._generator_module, "metadata_to_conditioning", None)
            if callable(func):
                return func
        return default_metadata_to_conditioning

    def generate_card_from_metadata(
        self,
        meta: Mapping[str, Any],
        num_inference_steps: int = 30,
        guidance_scale: float = 7.5,
        save_path: str | None = None,
    ):
        """Generate a card image from metadata; optionally save it to *save_path*.

        Returns:
            The first image from the pipeline's ``.images``.

        Raises:
            RuntimeError: If the pipeline does not return a diffusers-style
                object with a non-empty ``.images`` list.
        """
        conditioning = self._metadata_to_conditioning(meta)
        result = self._pipe(
            conditioning,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
        )

        if not hasattr(result, "images") or not result.images:
            raise RuntimeError(
                "Pipeline call did not return an object with non-empty .images. "
                "Ensure your pipeline follows diffusers-style output."
            )

        image = result.images[0]
        if save_path:
            output_file = Path(save_path).resolve()
            output_file.parent.mkdir(parents=True, exist_ok=True)
            image.save(str(output_file))
        return image
|
||||
|
||||
|
||||
def _build_parser() -> argparse.ArgumentParser:
    """Build the CLI argument parser for the three-stage pipeline."""
    parser = argparse.ArgumentParser(
        description="Run text cleaning + JSON inference + card generation in one command.",
    )
    parser.add_argument("text", help="User input text.")
    # Stage 1: text-cleaning module location.
    parser.add_argument(
        "--text-cleaner-path",
        required=True,
        help="Path to text-cleaning-pipeline.py that defines get_clean_text(text).",
    )
    # Stage 2: JSON inference script + template.
    parser.add_argument(
        "--infer-script-path",
        required=True,
        help="Path to infer_json_usage.py.",
    )
    parser.add_argument(
        "--template",
        required=True,
        help="Path to JSON template file.",
    )
    # Stage 3: model checkpoint and optional pluggable generator module.
    parser.add_argument(
        "--checkpoint",
        required=True,
        help="Path to model checkpoint (example: pokemon_card_lora/training_history.pt).",
    )
    parser.add_argument(
        "--generator-module",
        default="",
        help="Optional module path defining build_pipeline() and metadata_to_conditioning().",
    )
    parser.add_argument("--device", default="cpu", help="Checkpoint loading device (default: cpu).")
    # Diffusion sampling knobs.
    parser.add_argument("--num-inference-steps", type=int, default=30)
    parser.add_argument("--guidance-scale", type=float, default=7.5)
    parser.add_argument("--save-path", default="generated_card.png")
    parser.add_argument(
        "--python-executable",
        default=sys.executable,
        help="Python executable used to run infer_json_usage.py (default: current interpreter).",
    )
    # Debug/inspection flags.
    parser.add_argument(
        "--print-json",
        action="store_true",
        help="Print inferred JSON to stdout.",
    )
    parser.add_argument(
        "--print-clean-text",
        action="store_true",
        help="Print cleaned text to stdout.",
    )
    return parser
|
||||
|
||||
|
||||
def main() -> None:
    """Run the three pipeline stages: clean text, infer JSON, generate a card image."""
    args = _build_parser().parse_args()

    # Stage 1: text cleaning.
    get_clean_text = _load_function_from_file(args.text_cleaner_path, "get_clean_text")
    cleaned_text = get_clean_text(args.text)
    if not isinstance(cleaned_text, str):
        raise TypeError("get_clean_text(...) must return a string")

    # Stage 2: rule-based JSON inference via the CLI script.
    inferred_json = run_infer_json_cli(
        infer_script_path=args.infer_script_path,
        template_path=args.template,
        cleaned_text=cleaned_text,
        python_executable=args.python_executable,
    )

    # Stage 3: image generation from the inferred metadata.
    generator = CheckpointCardGenerator(
        checkpoint_path=args.checkpoint,
        device=args.device,
        generator_module_path=args.generator_module,
    )
    generator.generate_card_from_metadata(
        inferred_json,
        num_inference_steps=args.num_inference_steps,
        guidance_scale=args.guidance_scale,
        save_path=args.save_path,
    )

    # Optional debug output of intermediate stages.
    if args.print_clean_text:
        print(cleaned_text)
    if args.print_json:
        print(json.dumps(inferred_json, indent=2))

    print(f"Card generated and saved to: {args.save_path}")


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,298 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# 🎴 Génération de Carte Pokémon depuis un Texte Descriptif\n",
|
||||
"## Partie 1 — Nettoyage du Texte (NLU Pipeline)\n",
|
||||
"\n",
|
||||
"On prend un texte descriptif fourni par l'utilisateur et on le nettoie étape par étape.\n",
|
||||
"\n",
|
||||
"```\n",
|
||||
"Texte brut → Noise Removal → Tokenization → Stopwords → Lemmatization → Texte propre\n",
|
||||
"```"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"## 📦 Installation des dépendances"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "",
|
||||
"evalue": "",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[1;31mRunning cells with 'Python 3.12.3' requires the ipykernel package.\n",
|
||||
"\u001b[1;31m<a href='command:jupyter.createPythonEnvAndSelectController'>Create a Python Environment</a> with the required packages.\n",
|
||||
"\u001b[1;31mOr install 'ipykernel' using the command: '/usr/bin/python3 -m pip install ipykernel -U --user --force-reinstall'"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"!pip install nltk --quiet\n",
|
||||
"\n",
|
||||
"import nltk\n",
|
||||
"nltk.download('punkt', quiet=True)\n",
|
||||
"nltk.download('punkt_tab', quiet=True)\n",
|
||||
"nltk.download('stopwords', quiet=True)\n",
|
||||
"nltk.download('wordnet', quiet=True)\n",
|
||||
"nltk.download('averaged_perceptron_tagger', quiet=True)\n",
|
||||
"nltk.download('averaged_perceptron_tagger_eng', quiet=True)\n",
|
||||
"\n",
|
||||
"print(\"✅ Dépendances installées !\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"## 📝 Saisie du texte utilisateur"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"raw_text = \"\"\"\n",
|
||||
"This is a HUGE fire dragon!!! It has got massive red wings and shoots \n",
|
||||
"powerfull flames from its mouth... It's super fast n really strong!!\n",
|
||||
"Its body is coverd with shiny golden scales & it lives in volcanos.\n",
|
||||
"it luv to fight other pokémons and is very very aggressive >:(\n",
|
||||
"\"\"\"\n",
|
||||
"\n",
|
||||
"print(\"📄 Texte brut :\")\n",
|
||||
"print(raw_text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"## 🧹 Étape 1 — Noise Removal\n",
|
||||
"\n",
|
||||
"On supprime la ponctuation, les caractères spéciaux, les mots trop courts, et on met tout en minuscules.\n",
|
||||
"\n",
|
||||
"> 📖 *Cours page 25-29 — `removePunctuation`, `removeShortWords`, `removePattern`*"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import re\n",
|
||||
"import string\n",
|
||||
"\n",
|
||||
"def remove_punctuation(text):\n",
|
||||
" \"\"\"Supprime la ponctuation du texte.\"\"\"\n",
|
||||
" mapping_table = text.maketrans('', '', string.punctuation)\n",
|
||||
" return text.translate(mapping_table)\n",
|
||||
"\n",
|
||||
"def remove_special_chars(text):\n",
|
||||
" \"\"\"Supprime les caractères non-ASCII (emojis, accents parasites...).\"\"\"\n",
|
||||
" text = text.encode('ascii', 'ignore').decode('ascii')\n",
|
||||
" text = re.sub(r'[^a-zA-Z\\s]', ' ', text)\n",
|
||||
" return re.sub(r'\\s+', ' ', text).strip()\n",
|
||||
"\n",
|
||||
"def remove_short_words(text, min_len=3):\n",
|
||||
" \"\"\"Supprime les mots de moins de min_len caractères.\"\"\"\n",
|
||||
" return \" \".join([word for word in text.split() if len(word) >= min_len])\n",
|
||||
"\n",
|
||||
"# Application\n",
|
||||
"text = raw_text.lower() # minuscules\n",
|
||||
"text = remove_punctuation(text) # ponctuation\n",
|
||||
"text = remove_special_chars(text) # caractères spéciaux\n",
|
||||
"text = remove_short_words(text) # mots trop courts\n",
|
||||
"\n",
|
||||
"print(\"🔇 Après Noise Removal :\")\n",
|
||||
"print(text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"## 📖 Étape 2 — Object Standardization\n",
|
||||
"\n",
|
||||
"On remplace les abréviations et l'argot par leurs formes standard.\n",
|
||||
"\n",
|
||||
"> 📖 *Cours page 38 — lookup table `standardize`*"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"SLANG_LOOKUP = {\n",
|
||||
" \"n\": \"and\",\n",
|
||||
" \"luv\": \"love\",\n",
|
||||
" \"r\": \"are\",\n",
|
||||
" \"u\": \"you\",\n",
|
||||
" \"ur\": \"your\",\n",
|
||||
" \"gonna\": \"going to\",\n",
|
||||
" \"wanna\": \"want to\",\n",
|
||||
" \"gotta\": \"got to\",\n",
|
||||
" \"pokemons\": \"pokemon\",\n",
|
||||
" \"pokmons\": \"pokemon\",\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"def standardize(text, lookup=SLANG_LOOKUP):\n",
|
||||
" \"\"\"Remplace les mots d'argot par leur forme standard.\"\"\"\n",
|
||||
" words = text.split()\n",
|
||||
" return \" \".join([lookup.get(word, word) for word in words])\n",
|
||||
"\n",
|
||||
"text = standardize(text)\n",
|
||||
"\n",
|
||||
"print(\"📖 Après Standardisation :\")\n",
|
||||
"print(text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"## ✂️ Étape 3 — Tokenization\n",
|
||||
"\n",
|
||||
"On découpe le texte en tokens individuels.\n",
|
||||
"\n",
|
||||
"> 📖 *Cours page 31 — `word_tokenize` (NLTK)*"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from nltk import word_tokenize\n",
|
||||
"\n",
|
||||
"tokens = word_tokenize(text)\n",
|
||||
"\n",
|
||||
"print(f\"✂️ {len(tokens)} tokens :\")\n",
|
||||
"print(tokens)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"## 🚫 Étape 4 — Suppression des Stopwords\n",
|
||||
"\n",
|
||||
"On retire les mots grammaticaux qui n'apportent pas de sens (\"the\", \"is\", \"a\"...).\n",
|
||||
"\n",
|
||||
"> 📖 *Cours page 27 — `cleanTextGT` avec `stopwords` (NLTK)*"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from nltk.corpus import stopwords\n",
|
||||
"\n",
|
||||
"stop_words = set(stopwords.words('english'))\n",
|
||||
"\n",
|
||||
"tokens = [token for token in tokens if token not in stop_words]\n",
|
||||
"\n",
|
||||
"print(\"🚫 Tokens après suppression des stopwords :\")\n",
|
||||
"print(tokens)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"## 🌿 Étape 5 — Lemmatization\n",
|
||||
"\n",
|
||||
"On réduit chaque mot à sa forme racine (`flames → flame`, `shooting → shoot`). On utilise le POS tag pour plus de précision.\n",
|
||||
"\n",
|
||||
"> 📖 *Cours page 36-37 — `WordNetLemmatizer` + POS tag*"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from nltk.stem.wordnet import WordNetLemmatizer\n",
|
||||
"from nltk import pos_tag\n",
|
||||
"from nltk.corpus import wordnet\n",
|
||||
"\n",
|
||||
"lem = WordNetLemmatizer()\n",
|
||||
"\n",
|
||||
"def get_wordnet_pos(treebank_tag):\n",
|
||||
" \"\"\"Convertit les tags Penn Treebank en tags WordNet.\"\"\"\n",
|
||||
" if treebank_tag.startswith('J'): return wordnet.ADJ\n",
|
||||
" elif treebank_tag.startswith('V'): return wordnet.VERB\n",
|
||||
" elif treebank_tag.startswith('N'): return wordnet.NOUN\n",
|
||||
" elif treebank_tag.startswith('R'): return wordnet.ADV\n",
|
||||
" else: return wordnet.NOUN\n",
|
||||
"\n",
|
||||
"pos_tags = pos_tag(tokens)\n",
|
||||
"tokens = [lem.lemmatize(token, get_wordnet_pos(tag)) for token, tag in pos_tags]\n",
|
||||
"\n",
|
||||
"print(\"🌿 Tokens après Lemmatization :\")\n",
|
||||
"print(tokens)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"## ✅ Résultat final — Texte nettoyé"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"clean_text = \" \".join(tokens)\n",
|
||||
"\n",
|
||||
"print(\"📄 Texte brut :\")\n",
|
||||
"print(raw_text.strip())\n",
|
||||
"print()\n",
|
||||
"print(\"✅ Texte nettoyé :\")\n",
|
||||
"print(clean_text)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python",
|
||||
"version": "3.12.3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
@@ -0,0 +1,158 @@
|
||||
"""Reusable text-cleaning pipeline for Pokemon descriptions.
|
||||
|
||||
This module mirrors the notebook cleaning steps and exposes a Streamlit-friendly API:
|
||||
- no input() calls
|
||||
- no print side effects
|
||||
- deterministic output for a given input
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import string
|
||||
from typing import Any, Dict, List
|
||||
|
||||
SLANG_LOOKUP: Dict[str, str] = {
|
||||
"n": "and",
|
||||
"luv": "love",
|
||||
"r": "are",
|
||||
"u": "you",
|
||||
"ur": "your",
|
||||
"gonna": "going to",
|
||||
"wanna": "want to",
|
||||
"gotta": "got to",
|
||||
"pokemons": "pokemon",
|
||||
"pokmons": "pokemon",
|
||||
"bcz": "because",
|
||||
}
|
||||
|
||||
_NLTK_RESOURCES = [
|
||||
"punkt",
|
||||
"punkt_tab",
|
||||
"stopwords",
|
||||
"wordnet",
|
||||
"averaged_perceptron_tagger",
|
||||
"averaged_perceptron_tagger_eng",
|
||||
]
|
||||
|
||||
|
||||
def _import_nltk() -> Any:
|
||||
"""Import NLTK lazily so this module can be imported before deps are installed."""
|
||||
try:
|
||||
import nltk # type: ignore
|
||||
except ModuleNotFoundError as exc:
|
||||
raise RuntimeError(
|
||||
"NLTK is not installed. Install project dependencies with: pip install -r requirements.txt"
|
||||
) from exc
|
||||
return nltk
|
||||
|
||||
|
||||
def ensure_nltk_resources(quiet: bool = True) -> None:
|
||||
"""Download required NLTK resources if missing.
|
||||
|
||||
Safe to call at app startup (including inside Streamlit).
|
||||
"""
|
||||
nltk = _import_nltk()
|
||||
for resource in _NLTK_RESOURCES:
|
||||
try:
|
||||
nltk.download(resource, quiet=quiet)
|
||||
except Exception as exc:
|
||||
raise RuntimeError(f"Failed to download NLTK resource: {resource}") from exc
|
||||
|
||||
|
||||
def remove_punctuation(text: str) -> str:
|
||||
mapping_table = text.maketrans("", "", string.punctuation)
|
||||
return text.translate(mapping_table)
|
||||
|
||||
|
||||
def remove_special_chars(text: str) -> str:
|
||||
text = text.encode("ascii", "ignore").decode("ascii")
|
||||
text = re.sub(r"[^a-zA-Z\s]", " ", text)
|
||||
return re.sub(r"\s+", " ", text).strip()
|
||||
|
||||
|
||||
def remove_short_words(text: str, min_len: int = 3) -> str:
|
||||
return " ".join(word for word in text.split() if len(word) >= min_len)
|
||||
|
||||
|
||||
def remove_alphanum_words(text: str) -> str:
|
||||
words = text.split()
|
||||
cleaned = [
|
||||
word
|
||||
for word in words
|
||||
if not (re.search(r"[a-zA-Z]", word) and re.search(r"[0-9]", word))
|
||||
]
|
||||
return " ".join(cleaned)
|
||||
|
||||
|
||||
def standardize(text: str, lookup: Dict[str, str] | None = None) -> str:
|
||||
mapping = lookup or SLANG_LOOKUP
|
||||
return " ".join(mapping.get(word, word) for word in text.split())
|
||||
|
||||
|
||||
def _get_wordnet_pos(treebank_tag: str) -> str:
|
||||
nltk = _import_nltk()
|
||||
wordnet = nltk.corpus.wordnet
|
||||
if treebank_tag.startswith("J"):
|
||||
return wordnet.ADJ
|
||||
if treebank_tag.startswith("V"):
|
||||
return wordnet.VERB
|
||||
if treebank_tag.startswith("N"):
|
||||
return wordnet.NOUN
|
||||
if treebank_tag.startswith("R"):
|
||||
return wordnet.ADV
|
||||
return wordnet.NOUN
|
||||
|
||||
|
||||
def clean_pokemon_text(raw_text: str, min_len: int = 3) -> Dict[str, Any]:
|
||||
"""Run the full cleaning pipeline and return intermediate + final outputs.
|
||||
|
||||
Returns a dictionary so a UI can display each stage if desired.
|
||||
"""
|
||||
if not isinstance(raw_text, str):
|
||||
raise TypeError("raw_text must be a string")
|
||||
|
||||
nltk = _import_nltk()
|
||||
pos_tag = nltk.pos_tag
|
||||
word_tokenize = nltk.word_tokenize
|
||||
stopwords = nltk.corpus.stopwords
|
||||
WordNetLemmatizer = nltk.stem.wordnet.WordNetLemmatizer
|
||||
|
||||
ensure_nltk_resources(quiet=True)
|
||||
|
||||
text = raw_text.lower()
|
||||
text = remove_punctuation(text)
|
||||
text = remove_alphanum_words(text)
|
||||
text = remove_special_chars(text)
|
||||
noise_removed = remove_short_words(text, min_len=min_len)
|
||||
|
||||
standardized = standardize(noise_removed)
|
||||
|
||||
tokens = word_tokenize(standardized)
|
||||
|
||||
stop_words = set(stopwords.words("english"))
|
||||
tokens_no_stopwords = [token for token in tokens if token not in stop_words]
|
||||
|
||||
lem = WordNetLemmatizer()
|
||||
pos_tags = pos_tag(tokens_no_stopwords)
|
||||
lemmas = [
|
||||
lem.lemmatize(token, _get_wordnet_pos(tag))
|
||||
for token, tag in pos_tags
|
||||
]
|
||||
|
||||
clean_text = " ".join(lemmas)
|
||||
|
||||
return {
|
||||
"raw_text": raw_text,
|
||||
"noise_removed": noise_removed,
|
||||
"standardized": standardized,
|
||||
"tokens": tokens,
|
||||
"tokens_no_stopwords": tokens_no_stopwords,
|
||||
"lemmas": lemmas,
|
||||
"clean_text": clean_text,
|
||||
}
|
||||
|
||||
|
||||
def get_clean_text(raw_text: str, min_len: int = 3) -> str:
|
||||
"""Small helper for app code that only needs the final cleaned text."""
|
||||
return clean_pokemon_text(raw_text, min_len=min_len)["clean_text"]
|
||||
BIN
text-cleaner/__pycache__/text_cleaning_pipeline.cpython-312.pyc
Normal file
BIN
text-cleaner/__pycache__/text_cleaning_pipeline.cpython-312.pyc
Normal file
Binary file not shown.
451
text-cleaner/pokemon_text_cleaning.ipynb
Normal file
451
text-cleaner/pokemon_text_cleaning.ipynb
Normal file
@@ -0,0 +1,451 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Partie 1 — Nettoyage du Texte\n",
|
||||
"source .venv/bin/activate\n",
|
||||
"cd \n",
|
||||
"python nom du fichier\n",
|
||||
"On prend un texte descriptif fourni par l'utilisateur et on le nettoie étape par étape.\n",
|
||||
"\n",
|
||||
"```\n",
|
||||
"Texte brut → Noise Removal → Tokenization → Stopwords → Lemmatization → Texte propre\n",
|
||||
"```"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"## Installation des dépendances"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Dépendances installées !\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"!pip install nltk --quiet\n",
|
||||
"\n",
|
||||
"import nltk\n",
|
||||
"nltk.download('punkt', quiet=True)\n",
|
||||
"nltk.download('punkt_tab', quiet=True)\n",
|
||||
"nltk.download('stopwords', quiet=True)\n",
|
||||
"nltk.download('wordnet', quiet=True)\n",
|
||||
"nltk.download('averaged_perceptron_tagger', quiet=True)\n",
|
||||
"nltk.download('averaged_perceptron_tagger_eng', quiet=True)\n",
|
||||
"\n",
|
||||
"print(\"Dépendances installées !\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"## Saisie du texte utilisateur"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 86,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "KeyboardInterrupt",
|
||||
"evalue": "Interrupted by user",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
||||
"\u001b[31mKeyboardInterrupt\u001b[39m Traceback (most recent call last)",
|
||||
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[86]\u001b[39m\u001b[32m, line 65\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m# test_texts = [\u001b[39;00m\n\u001b[32m 2\u001b[39m \n\u001b[32m 3\u001b[39m \u001b[38;5;66;03m# # 0 — Dragon de feu (texte original)\u001b[39;00m\n\u001b[32m (...)\u001b[39m\u001b[32m 62\u001b[39m \n\u001b[32m 63\u001b[39m \u001b[38;5;66;03m# print(f\" Texte de test n°{INDEX} :\")\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m65\u001b[39m raw_text = \u001b[38;5;28;43minput\u001b[39;49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mDécrivez votre Pokémon : \u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 67\u001b[39m \u001b[38;5;28mprint\u001b[39m(raw_text)\n",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32m~/lib/python3.12/site-packages/ipykernel/kernelbase.py:1403\u001b[39m, in \u001b[36mKernel.raw_input\u001b[39m\u001b[34m(self, prompt)\u001b[39m\n\u001b[32m 1401\u001b[39m msg = \u001b[33m\"\u001b[39m\u001b[33mraw_input was called, but this frontend does not support input requests.\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 1402\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m StdinNotImplementedError(msg)\n\u001b[32m-> \u001b[39m\u001b[32m1403\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_input_request\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 1404\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mstr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mprompt\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1405\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_get_shell_context_var\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_shell_parent_ident\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1406\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mget_parent\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mshell\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1407\u001b[39m \u001b[43m \u001b[49m\u001b[43mpassword\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 1408\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32m~/lib/python3.12/site-packages/ipykernel/kernelbase.py:1448\u001b[39m, in \u001b[36mKernel._input_request\u001b[39m\u001b[34m(self, prompt, ident, parent, password)\u001b[39m\n\u001b[32m 1445\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyboardInterrupt\u001b[39;00m:\n\u001b[32m 1446\u001b[39m \u001b[38;5;66;03m# re-raise KeyboardInterrupt, to truncate traceback\u001b[39;00m\n\u001b[32m 1447\u001b[39m msg = \u001b[33m\"\u001b[39m\u001b[33mInterrupted by user\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m-> \u001b[39m\u001b[32m1448\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyboardInterrupt\u001b[39;00m(msg) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 1449\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m:\n\u001b[32m 1450\u001b[39m \u001b[38;5;28mself\u001b[39m.log.warning(\u001b[33m\"\u001b[39m\u001b[33mInvalid Message:\u001b[39m\u001b[33m\"\u001b[39m, exc_info=\u001b[38;5;28;01mTrue\u001b[39;00m)\n",
|
||||
"\u001b[31mKeyboardInterrupt\u001b[39m: Interrupted by user"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# test_texts = [\n",
|
||||
"\n",
|
||||
"# # 0 — Dragon de feu (texte original)\n",
|
||||
"# \"\"\"\n",
|
||||
"# This is a HUGE fire dragon!!! It has got massive red wings and shoots\n",
|
||||
"# powerfull flames from its mouth... It's super fast n really strong!!\n",
|
||||
"# Its body is coverd with shiny golden scales & it lives in volcanos.\n",
|
||||
"# it luv to fight other pokémons and is very very aggressive >:(\n",
|
||||
"# I want to call it Pyrokar.\n",
|
||||
"# \"\"\",\n",
|
||||
"\n",
|
||||
"# # 1 — Pokémon aquatique calme\n",
|
||||
"# \"\"\"\n",
|
||||
"# My pokemon is called Aqualis!! its a small blue sea creature w/ big\n",
|
||||
"# shiny eyes... very calm n gentle :) it swims super fast in deep oceans\n",
|
||||
"# and can breath underwater 4ever. it glows in the dark like a lanternfish\n",
|
||||
"# and heals other pokemons with its tears!!! luv this lil guy so much omg\n",
|
||||
"# \"\"\",\n",
|
||||
"\n",
|
||||
"# # 2 — Pokémon électrique agressif\n",
|
||||
"# \"\"\"\n",
|
||||
"# ZAPTHORN is da name!! its an electric wolf w/ yellow n black fur and\n",
|
||||
"# giant thunder claws !!! it runz at lightning speed thru storms & shoots\n",
|
||||
"# bolts from its tail... super scary n powerfull enemy 4 sure >:D\n",
|
||||
"# nobody can catch it bcz it disappears in the clouds when threatened\n",
|
||||
"# \"\"\",\n",
|
||||
"\n",
|
||||
"# # 3 — Pokémon plante timide\n",
|
||||
"# \"\"\"\n",
|
||||
"# i wanna name it Sylverion... its a shy deer-like pokemon covered in\n",
|
||||
"# beautiful flowers n vines. it lives deep in enchanted forests & only\n",
|
||||
"# comes out at nite. its antlers r made of ancient wood n bloom every\n",
|
||||
"# spring!! it can make plants grow super fast around it... so magical omg\n",
|
||||
"# \"\"\",\n",
|
||||
"\n",
|
||||
"# # 4 — Pokémon glace / fantôme\n",
|
||||
"# \"\"\"\n",
|
||||
"# This haunted ice spirit is called Glacyra!!! it floats thru frozen\n",
|
||||
"# mountains leavin icy footprints everywhere... its body is trasnparent\n",
|
||||
"# like glass n u can see its frozen heart inside >< it whispers 2 trainers\n",
|
||||
"# in their sleep n freezes everything it touchez. very misunderstood tbh\n",
|
||||
"# \"\"\",\n",
|
||||
"\n",
|
||||
"# # 5 — Pokémon combat en franglais\n",
|
||||
"# \"\"\"\n",
|
||||
"# My Pokémon is called Ferroknux!! It's a big metal gorilla with\n",
|
||||
"# gigantic iron fists and super thick armor on its chest... it smashes\n",
|
||||
"# rocks with bare hands and trains all day, every day in the mountains!!\n",
|
||||
"# Very strong and very aggressive, but loyal to its trainer 4ever :)\n",
|
||||
"# \"\"\",\n",
|
||||
"# #6 \n",
|
||||
"# \"\"\"\n",
|
||||
"# Furret is a long, slender, and agile creature with soft fur and a flexible body that allows it to move gracefully through narrow tunnels and hidden pathways. This normal-type Pokémon builds intricate nests perfectly shaped to fit its elongated form, making them nearly impossible for other creatures to enter. Despite its gentle and calm nature, Furret can become surprisingly energetic in battle, using its powerful tail to smash opponents with swift and playful attacks. It is often seen wandering across fields and forests, curiously observing its surroundings, and it shares a close bond with its pre-evolution, Sentret. Known for its endurance and cheerful spirit, Furret can quickly recover its energy, always feeling fine and ready to continue exploring or fighting alongside its trainer.\n",
|
||||
"# \"\"\",\n",
|
||||
"\n",
|
||||
"# ]\n",
|
||||
"\n",
|
||||
"# # 👇 Changez cet index pour tester un autre texte\n",
|
||||
"# INDEX = 6\n",
|
||||
"\n",
|
||||
"# raw_text = test_texts[INDEX]\n",
|
||||
"\n",
|
||||
"# print(f\" Texte de test n°{INDEX} :\")\n",
|
||||
"\n",
|
||||
"raw_text = input(\"Décrivez votre Pokémon : \")\n",
|
||||
"\n",
|
||||
"print(raw_text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"## Étape 1 — Noise Removal\n",
|
||||
"\n",
|
||||
"On supprime la ponctuation, les caractères spéciaux, les mots trop courts, et on met tout en minuscules.\n",
|
||||
"\n",
|
||||
"> *Cours page 25-29 — `removePunctuation`, `removeShortWords`, `removePattern`*"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 80,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Après Noise Removal :\n",
|
||||
"furret long slender and agile creature with soft fur and flexible body that allows move gracefully through narrow tunnels and hidden pathways this normaltype pokmon builds intricate nests perfectly shaped fit its elongated form making them nearly impossible for other creatures enter despite its gentle and calm nature furret can become surprisingly energetic battle using its powerful tail smash opponents with swift and playful attacks often seen wandering across fields and forests curiously observing its surroundings and shares close bond with its preevolution sentret known for its endurance and cheerful spirit furret can quickly recover its energy always feeling fine and ready continue exploring fighting alongside its trainer\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import re\n",
|
||||
"import string\n",
|
||||
"\n",
|
||||
"def remove_punctuation(text):\n",
|
||||
" \"\"\"Supprime la ponctuation du texte.\"\"\"\n",
|
||||
" mapping_table = text.maketrans('', '', string.punctuation)\n",
|
||||
" return text.translate(mapping_table)\n",
|
||||
"\n",
|
||||
"def remove_special_chars(text):\n",
|
||||
" \"\"\"Supprime les caractères non-ASCII (emojis, accents parasites...).\"\"\"\n",
|
||||
" text = text.encode('ascii', 'ignore').decode('ascii')\n",
|
||||
" text = re.sub(r'[^a-zA-Z\\s]', ' ', text)\n",
|
||||
" return re.sub(r'\\s+', ' ', text).strip()\n",
|
||||
"\n",
|
||||
"def remove_short_words(text, min_len=3):\n",
|
||||
" \"\"\"Supprime les mots de moins de min_len caractères.\"\"\"\n",
|
||||
" return \" \".join([word for word in text.split() if len(word) >= min_len])\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def remove_alphanum_words(text):\n",
|
||||
" \"\"\"Supprime les mots qui contiennent à la fois des lettres et des chiffres\n",
|
||||
" (ex: '4ever', 'n1', '2night', 'runz4', 'mp3').\"\"\"\n",
|
||||
" words = text.split()\n",
|
||||
" cleaned = [word for word in words\n",
|
||||
" if not (re.search(r'[a-zA-Z]', word) and re.search(r'[0-9]', word))]\n",
|
||||
" return \" \".join(cleaned)\n",
|
||||
"\n",
|
||||
"# Application\n",
|
||||
"text = raw_text.lower() # minuscules\n",
|
||||
"text = remove_punctuation(text) # ponctuation\n",
|
||||
"text = remove_alphanum_words(text) \n",
|
||||
"text = remove_special_chars(text) # caractères spéciaux\n",
|
||||
"text = remove_short_words(text) # mots trop courts\n",
|
||||
"\n",
|
||||
"print(\" Après Noise Removal :\")\n",
|
||||
"print(text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"## Étape 2 — Object Standardization\n",
|
||||
"\n",
|
||||
"On remplace les abréviations et l'argot par leurs formes standard.\n",
|
||||
"\n",
|
||||
"> *Cours page 38 — lookup table `standardize`*"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 81,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Après Standardisation :\n",
|
||||
"furret long slender and agile creature with soft fur and flexible body that allows move gracefully through narrow tunnels and hidden pathways this normaltype pokmon builds intricate nests perfectly shaped fit its elongated form making them nearly impossible for other creatures enter despite its gentle and calm nature furret can become surprisingly energetic battle using its powerful tail smash opponents with swift and playful attacks often seen wandering across fields and forests curiously observing its surroundings and shares close bond with its preevolution sentret known for its endurance and cheerful spirit furret can quickly recover its energy always feeling fine and ready continue exploring fighting alongside its trainer\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"SLANG_LOOKUP = {\n",
|
||||
" \"n\": \"and\",\n",
|
||||
" \"luv\": \"love\",\n",
|
||||
" \"r\": \"are\",\n",
|
||||
" \"u\": \"you\",\n",
|
||||
" \"ur\": \"your\",\n",
|
||||
" \"gonna\": \"going to\",\n",
|
||||
" \"wanna\": \"want to\",\n",
|
||||
" \"gotta\": \"got to\",\n",
|
||||
" \"pokemons\": \"pokemon\",\n",
|
||||
" \"pokmons\": \"pokemon\",\n",
|
||||
" \"bcz\": \"because\",\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"def standardize(text, lookup=SLANG_LOOKUP):\n",
|
||||
" \"\"\"Remplace les mots d'argot par leur forme standard.\"\"\"\n",
|
||||
" words = text.split()\n",
|
||||
" return \" \".join([lookup.get(word, word) for word in words])\n",
|
||||
"\n",
|
||||
"text = standardize(text)\n",
|
||||
"\n",
|
||||
"print(\" Après Standardisation :\")\n",
|
||||
"print(text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"## Étape 3 — Tokenization\n",
|
||||
"\n",
|
||||
"On découpe le texte en tokens individuels.\n",
|
||||
"\n",
|
||||
"> *Cours page 31 — `word_tokenize` (NLTK)*"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 82,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" 108 tokens :\n",
|
||||
"['furret', 'long', 'slender', 'and', 'agile', 'creature', 'with', 'soft', 'fur', 'and', 'flexible', 'body', 'that', 'allows', 'move', 'gracefully', 'through', 'narrow', 'tunnels', 'and', 'hidden', 'pathways', 'this', 'normaltype', 'pokmon', 'builds', 'intricate', 'nests', 'perfectly', 'shaped', 'fit', 'its', 'elongated', 'form', 'making', 'them', 'nearly', 'impossible', 'for', 'other', 'creatures', 'enter', 'despite', 'its', 'gentle', 'and', 'calm', 'nature', 'furret', 'can', 'become', 'surprisingly', 'energetic', 'battle', 'using', 'its', 'powerful', 'tail', 'smash', 'opponents', 'with', 'swift', 'and', 'playful', 'attacks', 'often', 'seen', 'wandering', 'across', 'fields', 'and', 'forests', 'curiously', 'observing', 'its', 'surroundings', 'and', 'shares', 'close', 'bond', 'with', 'its', 'preevolution', 'sentret', 'known', 'for', 'its', 'endurance', 'and', 'cheerful', 'spirit', 'furret', 'can', 'quickly', 'recover', 'its', 'energy', 'always', 'feeling', 'fine', 'and', 'ready', 'continue', 'exploring', 'fighting', 'alongside', 'its', 'trainer']\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from nltk import word_tokenize\n",
|
||||
"\n",
|
||||
"tokens = word_tokenize(text)\n",
|
||||
"\n",
|
||||
"print(f\" {len(tokens)} tokens :\")\n",
|
||||
"print(tokens)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"## Étape 4 — Suppression des Stopwords\n",
|
||||
"\n",
|
||||
"On retire les mots grammaticaux qui n'apportent pas de sens (\"the\", \"is\", \"a\"...).\n",
|
||||
"\n",
|
||||
"> *Cours page 27 — `cleanTextGT` avec `stopwords` (NLTK)*"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 83,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Tokens après suppression des stopwords :\n",
|
||||
"['furret', 'long', 'slender', 'agile', 'creature', 'soft', 'fur', 'flexible', 'body', 'allows', 'move', 'gracefully', 'narrow', 'tunnels', 'hidden', 'pathways', 'normaltype', 'pokmon', 'builds', 'intricate', 'nests', 'perfectly', 'shaped', 'fit', 'elongated', 'form', 'making', 'nearly', 'impossible', 'creatures', 'enter', 'despite', 'gentle', 'calm', 'nature', 'furret', 'become', 'surprisingly', 'energetic', 'battle', 'using', 'powerful', 'tail', 'smash', 'opponents', 'swift', 'playful', 'attacks', 'often', 'seen', 'wandering', 'across', 'fields', 'forests', 'curiously', 'observing', 'surroundings', 'shares', 'close', 'bond', 'preevolution', 'sentret', 'known', 'endurance', 'cheerful', 'spirit', 'furret', 'quickly', 'recover', 'energy', 'always', 'feeling', 'fine', 'ready', 'continue', 'exploring', 'fighting', 'alongside', 'trainer']\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from nltk.corpus import stopwords\n",
|
||||
"\n",
|
||||
"stop_words = set(stopwords.words('english'))\n",
|
||||
"\n",
|
||||
"tokens = [token for token in tokens if token not in stop_words]\n",
|
||||
"\n",
|
||||
"print(\"Tokens après suppression des stopwords :\")\n",
|
||||
"print(tokens)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"## Étape 5 — Lemmatization\n",
|
||||
"\n",
|
||||
"On réduit chaque mot à sa forme racine (`flames → flame`, `shooting → shoot`). On utilise le POS tag pour plus de précision.\n",
|
||||
"\n",
|
||||
"> *Cours page 36-37 — `WordNetLemmatizer` + POS tag*"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 84,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Tokens après Lemmatization :\n",
|
||||
"['furret', 'long', 'slender', 'agile', 'creature', 'soft', 'fur', 'flexible', 'body', 'allow', 'move', 'gracefully', 'narrow', 'tunnel', 'hide', 'pathway', 'normaltype', 'pokmon', 'build', 'intricate', 'nest', 'perfectly', 'shape', 'fit', 'elongated', 'form', 'make', 'nearly', 'impossible', 'creature', 'enter', 'despite', 'gentle', 'calm', 'nature', 'furret', 'become', 'surprisingly', 'energetic', 'battle', 'use', 'powerful', 'tail', 'smash', 'opponent', 'swift', 'playful', 'attack', 'often', 'see', 'wander', 'across', 'field', 'forest', 'curiously', 'observe', 'surroundings', 'share', 'close', 'bond', 'preevolution', 'sentret', 'know', 'endurance', 'cheerful', 'spirit', 'furret', 'quickly', 'recover', 'energy', 'always', 'feel', 'fine', 'ready', 'continue', 'explore', 'fight', 'alongside', 'trainer']\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from nltk.stem.wordnet import WordNetLemmatizer\n",
|
||||
"from nltk import pos_tag\n",
|
||||
"from nltk.corpus import wordnet\n",
|
||||
"\n",
|
||||
"lem = WordNetLemmatizer()\n",
|
||||
"\n",
|
||||
"def get_wordnet_pos(treebank_tag):\n",
|
||||
" \"\"\"Convertit les tags Penn Treebank en tags WordNet.\"\"\"\n",
|
||||
" if treebank_tag.startswith('J'): return wordnet.ADJ\n",
|
||||
" elif treebank_tag.startswith('V'): return wordnet.VERB\n",
|
||||
" elif treebank_tag.startswith('N'): return wordnet.NOUN\n",
|
||||
" elif treebank_tag.startswith('R'): return wordnet.ADV\n",
|
||||
" else: return wordnet.NOUN\n",
|
||||
"\n",
|
||||
"pos_tags = pos_tag(tokens)\n",
|
||||
"tokens = [lem.lemmatize(token, get_wordnet_pos(tag)) for token, tag in pos_tags]\n",
|
||||
"\n",
|
||||
"print(\" Tokens après Lemmatization :\")\n",
|
||||
"print(tokens)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"## Résultat final — Texte nettoyé"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 85,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"📄 Texte brut :\n",
|
||||
"Furret is a long, slender, and agile creature with soft fur and a flexible body that allows it to move gracefully through narrow tunnels and hidden pathways. This normal-type Pokémon builds intricate nests perfectly shaped to fit its elongated form, making them nearly impossible for other creatures to enter. Despite its gentle and calm nature, Furret can become surprisingly energetic in battle, using its powerful tail to smash opponents with swift and playful attacks. It is often seen wandering across fields and forests, curiously observing its surroundings, and it shares a close bond with its pre-evolution, Sentret. Known for its endurance and cheerful spirit, Furret can quickly recover its energy, always feeling fine and ready to continue exploring or fighting alongside its trainer.\n",
|
||||
"\n",
|
||||
"Texte nettoyé :\n",
|
||||
"furret long slender agile creature soft fur flexible body allow move gracefully narrow tunnel hide pathway normaltype pokmon build intricate nest perfectly shape fit elongated form make nearly impossible creature enter despite gentle calm nature furret become surprisingly energetic battle use powerful tail smash opponent swift playful attack often see wander across field forest curiously observe surroundings share close bond preevolution sentret know endurance cheerful spirit furret quickly recover energy always feel fine ready continue explore fight alongside trainer\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"clean_text = \" \".join(tokens)\n",
|
||||
"\n",
|
||||
"print(\"📄 Texte brut :\")\n",
|
||||
"print(raw_text.strip())\n",
|
||||
"print()\n",
|
||||
"print(\"Texte nettoyé :\")\n",
|
||||
"print(clean_text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
158
text-cleaner/text_cleaning_pipeline.py
Normal file
158
text-cleaner/text_cleaning_pipeline.py
Normal file
@@ -0,0 +1,158 @@
|
||||
"""Reusable text-cleaning pipeline for Pokemon descriptions.
|
||||
|
||||
This module mirrors the notebook cleaning steps and exposes a Streamlit-friendly API:
|
||||
- no input() calls
|
||||
- no print side effects
|
||||
- deterministic output for a given input
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import string
|
||||
from typing import Any, Dict, List
|
||||
|
||||
SLANG_LOOKUP: Dict[str, str] = {
|
||||
"n": "and",
|
||||
"luv": "love",
|
||||
"r": "are",
|
||||
"u": "you",
|
||||
"ur": "your",
|
||||
"gonna": "going to",
|
||||
"wanna": "want to",
|
||||
"gotta": "got to",
|
||||
"pokemons": "pokemon",
|
||||
"pokmons": "pokemon",
|
||||
"bcz": "because",
|
||||
}
|
||||
|
||||
_NLTK_RESOURCES = [
|
||||
"punkt",
|
||||
"punkt_tab",
|
||||
"stopwords",
|
||||
"wordnet",
|
||||
"averaged_perceptron_tagger",
|
||||
"averaged_perceptron_tagger_eng",
|
||||
]
|
||||
|
||||
|
||||
def _import_nltk() -> Any:
|
||||
"""Import NLTK lazily so this module can be imported before deps are installed."""
|
||||
try:
|
||||
import nltk # type: ignore
|
||||
except ModuleNotFoundError as exc:
|
||||
raise RuntimeError(
|
||||
"NLTK is not installed. Install project dependencies with: pip install -r requirements.txt"
|
||||
) from exc
|
||||
return nltk
|
||||
|
||||
|
||||
def ensure_nltk_resources(quiet: bool = True) -> None:
|
||||
"""Download required NLTK resources if missing.
|
||||
|
||||
Safe to call at app startup (including inside Streamlit).
|
||||
"""
|
||||
nltk = _import_nltk()
|
||||
for resource in _NLTK_RESOURCES:
|
||||
try:
|
||||
nltk.download(resource, quiet=quiet)
|
||||
except Exception as exc:
|
||||
raise RuntimeError(f"Failed to download NLTK resource: {resource}") from exc
|
||||
|
||||
|
||||
def remove_punctuation(text: str) -> str:
|
||||
mapping_table = text.maketrans("", "", string.punctuation)
|
||||
return text.translate(mapping_table)
|
||||
|
||||
|
||||
def remove_special_chars(text: str) -> str:
|
||||
text = text.encode("ascii", "ignore").decode("ascii")
|
||||
text = re.sub(r"[^a-zA-Z\s]", " ", text)
|
||||
return re.sub(r"\s+", " ", text).strip()
|
||||
|
||||
|
||||
def remove_short_words(text: str, min_len: int = 3) -> str:
|
||||
return " ".join(word for word in text.split() if len(word) >= min_len)
|
||||
|
||||
|
||||
def remove_alphanum_words(text: str) -> str:
|
||||
words = text.split()
|
||||
cleaned = [
|
||||
word
|
||||
for word in words
|
||||
if not (re.search(r"[a-zA-Z]", word) and re.search(r"[0-9]", word))
|
||||
]
|
||||
return " ".join(cleaned)
|
||||
|
||||
|
||||
def standardize(text: str, lookup: Dict[str, str] | None = None) -> str:
|
||||
mapping = lookup or SLANG_LOOKUP
|
||||
return " ".join(mapping.get(word, word) for word in text.split())
|
||||
|
||||
|
||||
def _get_wordnet_pos(treebank_tag: str) -> str:
    """Map a Penn Treebank POS tag to the matching WordNet POS constant.

    Only the first letter of the tag matters (J/V/N/R); any other tag —
    including an empty string — falls back to noun, which matches the
    WordNet lemmatizer's own default.
    """
    wordnet = _import_nltk().corpus.wordnet
    by_prefix = {
        "J": wordnet.ADJ,
        "V": wordnet.VERB,
        "N": wordnet.NOUN,
        "R": wordnet.ADV,
    }
    return by_prefix.get(treebank_tag[:1], wordnet.NOUN)
|
||||
|
||||
|
||||
def clean_pokemon_text(raw_text: str, min_len: int = 3) -> Dict[str, Any]:
    """Run the full cleaning pipeline and return intermediate + final outputs.

    Stages: lowercase -> strip punctuation -> drop mixed alphanumeric words
    -> strip special characters -> drop short words -> slang standardization
    -> tokenize -> stopword removal -> POS-aware lemmatization.

    Args:
        raw_text: The description text to clean.
        min_len: Minimum word length kept by the noise-removal stage.

    Returns:
        Dict holding every intermediate stage ("raw_text", "noise_removed",
        "standardized", "tokens", "tokens_no_stopwords", "lemmas") plus the
        final "clean_text" string, so a UI can display each stage if desired.

    Raises:
        TypeError: if raw_text is not a string.
    """
    if not isinstance(raw_text, str):
        raise TypeError("raw_text must be a string")

    nltk = _import_nltk()
    # Download any missing corpora/models before the stages below use them.
    ensure_nltk_resources(quiet=True)

    # Stage 1: character/word-level noise removal.
    lowered = raw_text.lower()
    no_punct = remove_punctuation(lowered)
    no_alphanum = remove_alphanum_words(no_punct)
    no_special = remove_special_chars(no_alphanum)
    noise_removed = remove_short_words(no_special, min_len=min_len)

    # Stage 2: slang normalization via the module-level lookup table.
    standardized = standardize(noise_removed)

    # Stage 3: tokenization and stopword filtering.
    tokens = nltk.word_tokenize(standardized)
    stop_words = set(nltk.corpus.stopwords.words("english"))
    tokens_no_stopwords = [tok for tok in tokens if tok not in stop_words]

    # Stage 4: POS-aware lemmatization (tag first, then lemmatize per tag).
    lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
    tagged = nltk.pos_tag(tokens_no_stopwords)
    lemmas = [
        lemmatizer.lemmatize(tok, _get_wordnet_pos(tag)) for tok, tag in tagged
    ]

    return {
        "raw_text": raw_text,
        "noise_removed": noise_removed,
        "standardized": standardized,
        "tokens": tokens,
        "tokens_no_stopwords": tokens_no_stopwords,
        "lemmas": lemmas,
        "clean_text": " ".join(lemmas),
    }
|
||||
|
||||
|
||||
def get_clean_text(raw_text: str, min_len: int = 3) -> str:
    """Convenience wrapper for app code that only needs the final cleaned text."""
    stages = clean_pokemon_text(raw_text, min_len=min_len)
    return stages["clean_text"]
|
||||
Reference in New Issue
Block a user