From 584b2e07b4618928db0c485a848d6a69595182ac Mon Sep 17 00:00:00 2001 From: Louis Labeyrie Date: Thu, 19 Mar 2026 18:16:20 +0100 Subject: [PATCH] first commit --- CLAUDE.md | 67 ++ app.py | 130 +++ card_generator_adapter.py | 107 +++ .../.ipynb_checkpoints/README-checkpoint.md | 189 +++++ .../example_usage-checkpoint.py | 36 + .../infer_json_usage-checkpoint.py | 0 .../json_inference-checkpoint.py | 0 .../json_template_example-checkpoint.json | 0 .../keyword_extractor-checkpoint.py | 137 ++++ .../requirements-checkpoint.txt | 0 .../test_json_inference-checkpoint.py | 0 .../test_keyword_extractor-checkpoint.py | 88 +++ clean-text-to-keywords/README.md | 189 +++++ .../json_inference.cpython-312.pyc | Bin 0 -> 13445 bytes .../keyword_extractor.cpython-312.pyc | Bin 0 -> 13250 bytes .../test_json_inference.cpython-312.pyc | Bin 0 -> 5126 bytes .../test_keyword_extractor.cpython-312.pyc | Bin 0 -> 7374 bytes clean-text-to-keywords/example_usage.py | 36 + clean-text-to-keywords/infer_json_usage.py | 111 +++ clean-text-to-keywords/json_inference.py | 398 ++++++++++ .../json_template_example.json | 35 + clean-text-to-keywords/keyword_extractor.py | 248 ++++++ clean-text-to-keywords/requirements.txt | 2 + clean-text-to-keywords/test_json_inference.py | 143 ++++ .../test_keyword_extractor.py | 166 ++++ fetch_card.py | 146 ++++ pokeball.png | Bin 0 -> 3692 bytes pokemon_card_training_2.ipynb | 742 ++++++++++++++++++ prompt_to_card_pipeline.py | 346 ++++++++ .../pokemon_text_cleaning-checkpoint.ipynb | 298 +++++++ .../text_cleaning_pipeline-checkpoint.py | 158 ++++ .../text_cleaning_pipeline.cpython-312.pyc | Bin 0 -> 7134 bytes text-cleaner/pokemon_text_cleaning.ipynb | 451 +++++++++++ text-cleaner/text_cleaning_pipeline.py | 158 ++++ 34 files changed, 4381 insertions(+) create mode 100644 CLAUDE.md create mode 100644 app.py create mode 100644 card_generator_adapter.py create mode 100644 clean-text-to-keywords/.ipynb_checkpoints/README-checkpoint.md create mode 100644 
clean-text-to-keywords/.ipynb_checkpoints/example_usage-checkpoint.py create mode 100644 clean-text-to-keywords/.ipynb_checkpoints/infer_json_usage-checkpoint.py create mode 100644 clean-text-to-keywords/.ipynb_checkpoints/json_inference-checkpoint.py create mode 100644 clean-text-to-keywords/.ipynb_checkpoints/json_template_example-checkpoint.json create mode 100644 clean-text-to-keywords/.ipynb_checkpoints/keyword_extractor-checkpoint.py create mode 100644 clean-text-to-keywords/.ipynb_checkpoints/requirements-checkpoint.txt create mode 100644 clean-text-to-keywords/.ipynb_checkpoints/test_json_inference-checkpoint.py create mode 100644 clean-text-to-keywords/.ipynb_checkpoints/test_keyword_extractor-checkpoint.py create mode 100644 clean-text-to-keywords/README.md create mode 100644 clean-text-to-keywords/__pycache__/json_inference.cpython-312.pyc create mode 100644 clean-text-to-keywords/__pycache__/keyword_extractor.cpython-312.pyc create mode 100644 clean-text-to-keywords/__pycache__/test_json_inference.cpython-312.pyc create mode 100644 clean-text-to-keywords/__pycache__/test_keyword_extractor.cpython-312.pyc create mode 100644 clean-text-to-keywords/example_usage.py create mode 100644 clean-text-to-keywords/infer_json_usage.py create mode 100644 clean-text-to-keywords/json_inference.py create mode 100644 clean-text-to-keywords/json_template_example.json create mode 100644 clean-text-to-keywords/keyword_extractor.py create mode 100644 clean-text-to-keywords/requirements.txt create mode 100644 clean-text-to-keywords/test_json_inference.py create mode 100644 clean-text-to-keywords/test_keyword_extractor.py create mode 100644 fetch_card.py create mode 100644 pokeball.png create mode 100644 pokemon_card_training_2.ipynb create mode 100644 prompt_to_card_pipeline.py create mode 100644 text-cleaner/.ipynb_checkpoints/pokemon_text_cleaning-checkpoint.ipynb create mode 100644 text-cleaner/.ipynb_checkpoints/text_cleaning_pipeline-checkpoint.py create mode 100644 
text-cleaner/__pycache__/text_cleaning_pipeline.cpython-312.pyc create mode 100644 text-cleaner/pokemon_text_cleaning.ipynb create mode 100644 text-cleaner/text_cleaning_pipeline.py diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..f342a1a --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,67 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Project Overview + +Juicepyter is a Pokémon card generator pipeline that takes a natural language description, cleans it, extracts structured JSON metadata, and generates a card image using a LoRA-finetuned Stable Diffusion model. A Streamlit UI (`app.py`) ties it all together. + +## Architecture — Three-Stage Pipeline + +The pipeline (`prompt_to_card_pipeline.py`) orchestrates three stages: + +1. **Text cleaning** (`text-cleaner/text_cleaning_pipeline.py`): NLTK-based pipeline — lowercasing, punctuation/slang removal, stopword filtering, POS-aware lemmatization. Entry point: `get_clean_text(raw_text) -> str`. + +2. **Keyword extraction + JSON inference** (`clean-text-to-keywords/`): spaCy + YAKE keyword extraction (`keyword_extractor.py`) → rule-based JSON inference (`json_inference.py`) that populates a TCG-style card template. CLI: `infer_json_usage.py`. No LLM calls — deterministic and rule-based. + +3. **Card image generation** (`card_generator_adapter.py`): Loads `runwayml/stable-diffusion-v1-5` with a LoRA adapter (PEFT) from `pokemon_card_lora/`, converts metadata to a SD prompt via `metadata_to_conditioning()`, runs inference. The generator module is pluggable via `--generator-module`. + +`fetch_card.py` is a standalone data collection script that downloads real Pokémon TCG card images with embedded metadata using the TCGdex SDK. 
+ +## Commands + +### Run the Streamlit app +```bash +streamlit run app.py +``` + +### Run the full pipeline CLI +```bash +python prompt_to_card_pipeline.py "description text" \ + --text-cleaner-path text-cleaner/text_cleaning_pipeline.py \ + --infer-script-path clean-text-to-keywords/infer_json_usage.py \ + --checkpoint pokemon_card_lora \ + --template clean-text-to-keywords/json_template_example.json \ + --generator-module card_generator_adapter.py \ + --device cpu \ + --save-path generated_card.png \ + --print-json +``` + +### Run keyword extraction + JSON inference only +```bash +cd clean-text-to-keywords +python infer_json_usage.py --template json_template_example.json "your pokemon description" +``` + +### Tests +```bash +cd clean-text-to-keywords +python -m unittest -q +``` + +## Dependencies + +- **text-cleaner**: `nltk` (punkt, stopwords, wordnet, averaged_perceptron_tagger) +- **clean-text-to-keywords**: `spacy>=3.7.0`, `yake>=0.4.2`, spaCy model `en_core_web_sm` +- **card generation**: `diffusers`, `torch`, `peft`, `transformers`, `accelerate`, `safetensors` +- **app**: `streamlit`, `Pillow` +- **fetch_card**: `tcgdexsdk`, `Pillow` + +Python 3.13 or lower recommended (spaCy compatibility). + +## Key Design Decisions + +- The generator module pattern is pluggable: any module with `build_pipeline(checkpoint_path, device)` and optionally `metadata_to_conditioning(meta)` can be swapped in via `--generator-module`. +- The JSON inference stage preserves non-empty fields in the provided template — only empty fields get populated. +- The LoRA base model is `runwayml/stable-diffusion-v1-5` with PEFT adapter weights in `pokemon_card_lora/`. 
diff --git a/app.py b/app.py new file mode 100644 index 0000000..76c50be --- /dev/null +++ b/app.py @@ -0,0 +1,130 @@ +import streamlit as st +import subprocess +import sys +import shlex +from pathlib import Path +from PIL import Image + +APP_DIR = Path(__file__).resolve().parent +PIPELINE_SCRIPT = APP_DIR / "prompt_to_card_pipeline.py" +TEXT_CLEANER_PATH = APP_DIR / "text-cleaner" / "text_cleaning_pipeline.py" +INFER_SCRIPT_PATH = APP_DIR / "clean-text-to-keywords" / "infer_json_usage.py" +CHECKPOINT_PATH = APP_DIR / "pokemon_card_lora" / "training_history.pt" +TEMPLATE_PATH = APP_DIR / "clean-text-to-keywords" / "json_template_example.json" +IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg", ".webp", ".bmp") + + +def _extract_image_from_stdout(stdout: str) -> Path | None: + for line in reversed(stdout.splitlines()): + text = line.strip().strip("\"'") + if not text: + continue + + candidate = Path(text) + if not candidate.is_absolute(): + candidate = APP_DIR / candidate + + if candidate.suffix.lower() in IMAGE_EXTENSIONS and candidate.exists(): + return candidate + + return None + + + +def run_prompt_pipeline(prompt_text: str) -> tuple[Path | None, str]: + cmd = [ + sys.executable, str(PIPELINE_SCRIPT), prompt_text, + "--text-cleaner-path", str(TEXT_CLEANER_PATH), "--infer-script-path", str(INFER_SCRIPT_PATH), + "--checkpoint", "pokemon_card_lora", "--template", str(TEMPLATE_PATH), + "--generator-module", "card_generator_adapter.py", "--device", "cuda", + "--save-path", "generated_card.png", "--print-json"] + + result = subprocess.run( + cmd, + cwd=APP_DIR, + capture_output=True, + text=True, + check=False, + ) + + full_output = (result.stdout or "") + ("\n" + result.stderr if result.stderr else "") + + if result.returncode != 0: + return None, full_output.strip() or "Erreur inconnue pendant le pipeline." + + image_path = _extract_image_from_stdout(result.stdout or "") + return image_path, full_output.strip() + +# 
------------------------------------------------------------------ # +# Configuration # +# ------------------------------------------------------------------ # + +st.set_page_config( + page_title="Générateur de Carte Pokémon", + page_icon=Image.open(Path(__file__).with_name("pokeball.png")), + layout="centered", +) + +logo_col, title_col = st.columns([1, 6], vertical_alignment="center") + +with logo_col: + st.image(Image.open(Path(__file__).with_name("pokeball.png")), width=72) + +with title_col: + st.title("Générateur de Carte Pokémon") +st.markdown("Décrivez votre Pokémon en langage naturel et laissez la magie opérer !") + +# ------------------------------------------------------------------ # +# Saisie utilisateur # +# ------------------------------------------------------------------ # + +raw_text = st.text_area( + label="Description de votre Pokémon", + placeholder=( + "Ex: My pokemon is called Pyrokar! Its a huge fire dragon with massive " + "red wings and shoots flames from its mouth... super fast n aggressive >:(" + ), + height=180, +) + +st.markdown( + """ + + """, + unsafe_allow_html=True, +) + +generate = st.button(" Générer la carte", use_container_width=True) + +# ------------------------------------------------------------------ # +# Pipeline # +# ------------------------------------------------------------------ # + +if generate: + if not raw_text.strip(): + st.warning("Veuillez entrer une description avant de générer.") + else: + with st.spinner("Génération de la carte Pokémon..."): + image, logs = run_prompt_pipeline(raw_text) + + if image is not None: + st.image(image, caption="Carte Pokémon générée", width="stretch") + if logs: + with st.expander("Logs pipeline"): + st.code(logs) + else: + st.error("Aucune image générée détectée. 
Vérifiez les chemins du pipeline.") + if logs: + with st.expander("Logs pipeline"): + st.code(logs) diff --git a/card_generator_adapter.py b/card_generator_adapter.py new file mode 100644 index 0000000..c88271e --- /dev/null +++ b/card_generator_adapter.py @@ -0,0 +1,107 @@ +"""Adapter to load the LoRA checkpoint and define conditioning logic. + +Customize this file to match your model architecture, then use: + --generator-module card_generator_adapter.py +""" + +from __future__ import annotations + +from typing import Any, Mapping + + +def build_pipeline(checkpoint_path: str, device: str): + """Load LoRA adapter and return a callable pipeline. + + The pipeline must accept: + pipeline(prompt_or_conditioning, num_inference_steps=30, guidance_scale=7.5) + + and return an object with .images attribute. + """ + from pathlib import Path + + checkpoint_input = Path(checkpoint_path).expanduser().resolve() + if checkpoint_input.is_dir(): + checkpoint_dir = checkpoint_input + elif checkpoint_input.exists(): + checkpoint_dir = checkpoint_input.parent + else: + raise FileNotFoundError(f"Checkpoint path not found: {checkpoint_input}") + + # Load base Stable Diffusion model + LoRA adapter (PEFT) + try: + from diffusers import StableDiffusionPipeline + import torch + except ImportError as e: + raise RuntimeError( + f"diffusers and torch required. Install: pip install diffusers torch " + f"(error: {e})" + ) + + # Load base model + model_id = "runwayml/stable-diffusion-v1-5" + pipe = StableDiffusionPipeline.from_pretrained( + model_id, + torch_dtype=torch.float16 if device == "cuda" else torch.float32, + ) + pipe = pipe.to(device) + + # Load LoRA weights from adapter_model.safetensors + adapter_path = checkpoint_dir / "adapter_model.safetensors" + if adapter_path.exists(): + try: + pipe.load_lora_weights(str(checkpoint_dir)) + except Exception as e: + message = str(e) + if "PEFT backend is required" in message: + raise RuntimeError( + "Failed to load LoRA: PEFT backend is missing. 
" + "Install required packages with: pip install peft transformers accelerate safetensors" + ) from e + raise RuntimeError( + f"Failed to load LoRA from {checkpoint_dir}: {e}\n" + "Ensure adapter_config.json and adapter_model.safetensors are present." + ) from e + else: + raise FileNotFoundError( + f"LoRA adapter not found at {adapter_path}. " + f"Expected: adapter_model.safetensors in {checkpoint_dir}" + ) + + return pipe + + +def metadata_to_conditioning(meta: Mapping[str, Any]) -> str: + """Convert metadata dict to a Stable Diffusion prompt. + + LoRA is trained on Pokemon cards, so describe it as such. + """ + name = str(meta.get("name", "Unknown Pokemon")) + pokemon_type = str(meta.get("type", "normal")).capitalize() + secondary = meta.get("secondary_type") + + hp = str(meta.get("hp", "60")) + + attacks = meta.get("attacks") or [] + attack_list = [] + if isinstance(attacks, list): + for atk in attacks: + if isinstance(atk, dict): + attack_list.append(str(atk.get("name", "")).lower()) + elif atk: + attack_list.append(str(atk).lower()) + + # Build a descriptive prompt for card generation + prompt = f"Pokemon trading card of {name}, {pokemon_type}-type Pokemon" + if secondary: + prompt += f"/{secondary.capitalize()}" + prompt += f", HP {hp}" + + if attack_list: + prompt += f", with attacks: {', '.join(attack_list[:2])}" + + description = meta.get("description", "").strip() + if description: + prompt += f". {description}" + + prompt += ". High quality illustration, official Pokemon card style." + return prompt diff --git a/clean-text-to-keywords/.ipynb_checkpoints/README-checkpoint.md b/clean-text-to-keywords/.ipynb_checkpoints/README-checkpoint.md new file mode 100644 index 0000000..1f6c72f --- /dev/null +++ b/clean-text-to-keywords/.ipynb_checkpoints/README-checkpoint.md @@ -0,0 +1,189 @@ +# Pokemon Text-to-JSON Pipeline + +This project converts free-form Pokemon description text into: + +1. A normalized keyword list +2. 
A populated Pokemon JSON object (from a blank/key-only template) + +The pipeline is deterministic and rule-based. + +## Architecture + +### Stage 1: Keyword Extraction + +File: `keyword_extractor.py` + +Input: raw text description + +Core logic: + +- spaCy tokenization and POS tagging +- POS filtering (`NOUN`, `ADJ`, `VERB`) +- stopword and punctuation removal +- lemma-based normalization +- domain synonym normalization (example: `flames -> fire`) +- optional YAKE relevance scoring +- conservative retention policy so detail is not over-pruned + +Output: ordered list of normalized keywords + +### Stage 2: JSON Inference + +File: `json_inference.py` + +Input: keyword list + optional JSON template + +Core logic: + +- infer primary/secondary type +- infer name candidate +- infer attacks, abilities, habitat, personality +- infer basic stats (`hp`, `attack`, `defense`, `speed`) +- fill nested TCG-like template fields (`types`, `attacks`, `weaknesses`, `stage`, `retreat`, etc.) +- preserve already non-empty values in the provided template + +Output: inferred JSON profile + +### Stage 3: Orchestration CLI + +File: `infer_json_usage.py` + +This is the main entrypoint for end-to-end usage. + +Default behavior: + +1. prints extracted keyword list +2. prints inferred JSON + +## Project Structure + +- `keyword_extractor.py`: keyword extraction engine +- `json_inference.py`: keyword-to-JSON inference logic +- `infer_json_usage.py`: end-to-end CLI +- `example_usage.py`: keyword extraction only CLI +- `json_template_example.json`: sample blank/key-only template +- `test_keyword_extractor.py`: extraction tests +- `test_json_inference.py`: inference tests +- `requirements.txt`: Python dependencies + +## Requirements + +- Python 3.13 or lower is recommended for spaCy compatibility +- pip + +Dependencies in `requirements.txt`: + +- `spacy>=3.7.0` +- `yake>=0.4.2` + +## Setup + +1. 
Create and activate a virtual environment (recommended) + +```bash +python -m venv .venv +source .venv/bin/activate +``` + +2. Install dependencies + +```bash +pip install -r requirements.txt +``` + +3. Install spaCy English model + +```bash +python -m spacy download en_core_web_sm +``` + +## How To Run + +### A) Extract keywords only + +```bash +python example_usage.py "furret long slender agile creature with soft fur" +``` + +Output: JSON list of keywords. + +### B) End-to-end: text -> keywords -> JSON + +```bash +python infer_json_usage.py --template json_template_example.json "furret long slender agile creature with soft fur" +``` + +Output order: + +1. keyword list +2. inferred JSON + +### C) End-to-end but JSON only + +```bash +python infer_json_usage.py --json-only --template json_template_example.json "furret long slender agile creature with soft fur" +``` + +### D) Start from keywords directly + +```bash +python infer_json_usage.py --template json_template_example.json --keywords furret normal tail smash tunnel agile cheerful explore endurance +``` + +Tip: If you pass `--keywords`, text extraction is skipped. + +## Template Behavior + +If `--template` is omitted, inference returns a full inferred profile object. + +If `--template` is provided: + +- empty fields are populated from inferred values +- non-empty fields are preserved + +Current sample template supports nested card-like data including: + +- `types` +- `attacks` with `cost`, `name`, `effect`, `damage` +- `weaknesses` with `type`, `value` +- `stage`, `retreat`, `legal` + +## Tests + +Run all tests: + +```bash +python -m unittest -q +``` + +## Troubleshooting + +### 1) spaCy model not found + +Error mentions `en_core_web_sm` not installed. + +Fix: + +```bash +python -m spacy download en_core_web_sm +``` + +### 2) spaCy import/runtime problems on very new Python versions + +Use Python 3.13 or lower and reinstall requirements. 
+ +### 3) `--template` path errors + +Ensure `--template` points to a valid file path, for example: + +```bash +--template json_template_example.json +``` + +If your input is already a keyword list, use `--keywords` instead of putting the list in `--template`. + +## Design Notes + +- deterministic and explainable (no LLM calls) +- domain mappings are easy to extend in `keyword_extractor.py` and `json_inference.py` +- scoring and template fill rules are intentionally simple and stable for game-content generation diff --git a/clean-text-to-keywords/.ipynb_checkpoints/example_usage-checkpoint.py b/clean-text-to-keywords/.ipynb_checkpoints/example_usage-checkpoint.py new file mode 100644 index 0000000..61b274b --- /dev/null +++ b/clean-text-to-keywords/.ipynb_checkpoints/example_usage-checkpoint.py @@ -0,0 +1,36 @@ +import argparse +import json +from typing import Sequence + +from keyword_extractor import KeywordExtractor + + +def _build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + description="Extract normalized keywords from cleaned text.", + ) + parser.add_argument( + "text", + nargs="+", + help="Input text to process. 
Pass as one quoted string or multiple words.", + ) + parser.add_argument( + "--model", + default="en_core_web_sm", + help="spaCy model name (default: en_core_web_sm).", + ) + return parser + + +def main(argv: Sequence[str] | None = None) -> None: + parser = _build_parser() + args = parser.parse_args(argv) + + text = " ".join(args.text) + extractor = KeywordExtractor.from_default_model(model_name=args.model) + keywords = extractor.extract(text) + print(json.dumps(keywords)) + + +if __name__ == "__main__": + main() diff --git a/clean-text-to-keywords/.ipynb_checkpoints/infer_json_usage-checkpoint.py b/clean-text-to-keywords/.ipynb_checkpoints/infer_json_usage-checkpoint.py new file mode 100644 index 0000000..e69de29 diff --git a/clean-text-to-keywords/.ipynb_checkpoints/json_inference-checkpoint.py b/clean-text-to-keywords/.ipynb_checkpoints/json_inference-checkpoint.py new file mode 100644 index 0000000..e69de29 diff --git a/clean-text-to-keywords/.ipynb_checkpoints/json_template_example-checkpoint.json b/clean-text-to-keywords/.ipynb_checkpoints/json_template_example-checkpoint.json new file mode 100644 index 0000000..e69de29 diff --git a/clean-text-to-keywords/.ipynb_checkpoints/keyword_extractor-checkpoint.py b/clean-text-to-keywords/.ipynb_checkpoints/keyword_extractor-checkpoint.py new file mode 100644 index 0000000..3df689e --- /dev/null +++ b/clean-text-to-keywords/.ipynb_checkpoints/keyword_extractor-checkpoint.py @@ -0,0 +1,137 @@ +"""Rule-based keyword extraction and normalization for Pokemon card generation.""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Any, Dict, Iterable, List, Mapping, Optional, Sequence, Set, Tuple + +# Canonical concept -> synonym list +from typing import Dict, List + +DEFAULT_NORMALIZATION_MAP: Dict[str, List[str]] = { + "normal": ["basic", "common", "regular", "plain"], + "fire": ["flame", "flames", "burn", "burning", "blaze", "fiery", "heat", "inferno"], + "water": 
["wave", "ocean", "sea", "river", "aqua", "splash", "tidal"], + "grass": ["plant", "leaf", "forest", "nature", "vine", "seed", "flora"], + "flying": ["air", "wind", "sky", "wing", "wings", "flight", "soar"], + "fighting": ["punch", "kick", "strike", "martial", "combat", "brawl"], + "poison": ["toxic", "venom", "acid", "poisonous", "toxin"], + "electric": ["lightning", "thunder", "shock", "volt", "spark", "electricity"], + "ground": ["earth", "soil", "sand", "mud", "quake", "dust"], + "rock": ["stone", "boulder", "crystal", "rocky", "pebble"], + "psychic": ["mind", "mental", "telepathy", "psyonic", "brain", "illusion"], + "ice": ["freeze", "frozen", "snow", "frost", "blizzard", "icy"], + "bug": ["insect", "ant", "beetle", "spider", "crawler"], + "ghost": ["spirit", "phantom", "haunt", "shadow", "specter"], + "steel": ["metal", "iron", "armor", "blade", "alloy"], + "dragon": ["drake", "wyrm", "serpent", "legendary"], + "dark": ["shadow", "evil", "night", "doom", "darkness"], + "fairy": ["magic", "magical", "sparkle", "light", "charm"], +} + +DEFAULT_ALLOWED_POS: Tuple[str, ...] 
= ("NOUN", "ADJ", "VERB") + + +def _invert_normalization_map(normalization_map: Mapping[str, Iterable[str]]) -> Dict[str, str]: + """Build synonym -> canonical mapping for O(1) normalization lookup.""" + inverse: Dict[str, str] = {} + for canonical, synonyms in normalization_map.items(): + canonical_normalized = canonical.strip().lower() + inverse[canonical_normalized] = canonical_normalized + for synonym in synonyms: + synonym_normalized = synonym.strip().lower() + if synonym_normalized: + inverse[synonym_normalized] = canonical_normalized + return inverse + + +def _deduplicate_preserve_order(items: Iterable[str]) -> List[str]: + seen: Set[str] = set() + output: List[str] = [] + for item in items: + if item not in seen: + seen.add(item) + output.append(item) + return output + + +@dataclass +class KeywordExtractor: + """Deterministic spaCy + rule-based keyword extraction pipeline.""" + + nlp: Any + normalization_map: Mapping[str, Iterable[str]] = field(default_factory=lambda: DEFAULT_NORMALIZATION_MAP) + allowed_pos: Sequence[str] = field(default_factory=lambda: DEFAULT_ALLOWED_POS) + + def __post_init__(self) -> None: + self._normalization_lookup = _invert_normalization_map(self.normalization_map) + self._allowed_pos_set = set(self.allowed_pos) + + @classmethod + def from_default_model( + cls, + model_name: str = "en_core_web_sm", + normalization_map: Optional[Mapping[str, Iterable[str]]] = None, + allowed_pos: Sequence[str] = DEFAULT_ALLOWED_POS, + ) -> "KeywordExtractor": + """Initialize extractor with a spaCy English pipeline.""" + try: + import spacy + + nlp = spacy.load(model_name) + except OSError as exc: + raise OSError( + f"spaCy model '{model_name}' is not installed. " + "Run: python -m spacy download en_core_web_sm" + ) from exc + except Exception as exc: + raise RuntimeError( + "spaCy could not be loaded in this Python environment. " + "Try Python 3.13 or lower, then install spaCy and en_core_web_sm." 
+ ) from exc + + return cls( + nlp=nlp, + normalization_map=normalization_map or DEFAULT_NORMALIZATION_MAP, + allowed_pos=allowed_pos, + ) + + def extract(self, text: str) -> List[str]: + """Extract and normalize keywords from already-cleaned text.""" + if not text or not text.strip(): + return [] + + doc = self.nlp(text) + + # Step 1: POS filtering + base normalization to lowercase lemmas/tokens. + raw_keywords: List[str] = [] + for token in doc: + if token.is_stop or token.is_punct or token.pos_ not in self._allowed_pos_set: + continue + + # Use lemma where possible to collapse inflections. + base = token.lemma_.lower().strip() if token.lemma_ and token.lemma_ != "-PRON-" else token.text.lower().strip() + if base: + raw_keywords.append(base) + + # Step 2: Deduplicate before domain normalization (as requested in README). + deduplicated = _deduplicate_preserve_order(raw_keywords) + + # Step 3: Map variants/synonyms to canonical concepts. + normalized = [self._normalize_keyword(keyword) for keyword in deduplicated] + + # Step 4: Deduplicate again, since multiple words can map to one concept. 
+ return _deduplicate_preserve_order(normalized) + + def _normalize_keyword(self, keyword: str) -> str: + keyword_lower = keyword.lower() + return self._normalization_lookup.get(keyword_lower, keyword_lower) + + +def extract_keywords( + text: str, + extractor: Optional[KeywordExtractor] = None, +) -> List[str]: + """Convenience API to extract keywords with default extractor config.""" + active_extractor = extractor or KeywordExtractor.from_default_model() + return active_extractor.extract(text) diff --git a/clean-text-to-keywords/.ipynb_checkpoints/requirements-checkpoint.txt b/clean-text-to-keywords/.ipynb_checkpoints/requirements-checkpoint.txt new file mode 100644 index 0000000..e69de29 diff --git a/clean-text-to-keywords/.ipynb_checkpoints/test_json_inference-checkpoint.py b/clean-text-to-keywords/.ipynb_checkpoints/test_json_inference-checkpoint.py new file mode 100644 index 0000000..e69de29 diff --git a/clean-text-to-keywords/.ipynb_checkpoints/test_keyword_extractor-checkpoint.py b/clean-text-to-keywords/.ipynb_checkpoints/test_keyword_extractor-checkpoint.py new file mode 100644 index 0000000..498eeb9 --- /dev/null +++ b/clean-text-to-keywords/.ipynb_checkpoints/test_keyword_extractor-checkpoint.py @@ -0,0 +1,88 @@ +import unittest + +from keyword_extractor import KeywordExtractor + + +class FakeToken: + def __init__(self, text: str, pos: str, lemma: str, is_stop: bool) -> None: + self.text = text + self.pos_ = pos + self.lemma_ = lemma + self.is_stop = is_stop + self.is_punct = not any(ch.isalnum() for ch in text) + + +class FakeNLP: + def __init__(self, tag_map, stopwords) -> None: + self.tag_map = tag_map + self.stopwords = stopwords + + def __call__(self, text: str): + tokens = [] + for raw in text.split(): + token_text = raw.strip() + lowered = token_text.lower() + tokens.append( + FakeToken( + text=token_text, + pos=self.tag_map.get(lowered, "NOUN"), + lemma=lowered, + is_stop=lowered in self.stopwords, + ) + ) + return tokens + + +class 
KeywordExtractorTests(unittest.TestCase): + @classmethod + def setUpClass(cls) -> None: + tag_map = { + "fiery": "ADJ", + "dragon": "NOUN", + "attack": "VERB", + "explosive": "ADJ", + "flames": "NOUN", + "burning": "ADJ", + "creature": "NOUN", + "with": "ADP", + "blaze": "NOUN", + "power": "NOUN", + "electric": "ADJ", + "mouse": "NOUN", + "using": "VERB", + "thunder": "NOUN", + "shock": "NOUN", + "a": "DET", + "very": "ADV", + "strong": "ADJ", + "and": "CCONJ", + "dangerous": "ADJ", + } + + stopwords = {"a", "very", "and", "with"} + cls.nlp = FakeNLP(tag_map=tag_map, stopwords=stopwords) + cls.extractor = KeywordExtractor(nlp=cls.nlp) + + def test_readme_main_example(self) -> None: + text = "fiery dragon attack explosive flames" + result = self.extractor.extract(text) + self.assertEqual(result, ["fire", "dragon", "attack", "explosion"]) + + def test_synonym_normalization(self) -> None: + text = "burning creature with blaze power" + result = self.extractor.extract(text) + self.assertEqual(result, ["fire", "creature", "power"]) + + def test_mixed_types(self) -> None: + text = "electric mouse using thunder shock" + result = self.extractor.extract(text) + self.assertEqual(result, ["electric", "mouse", "using"]) + + def test_noise_input(self) -> None: + text = "a very very strong and dangerous creature" + result = self.extractor.extract(text) + self.assertEqual(result, ["strong", "dangerous", "creature"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/clean-text-to-keywords/README.md b/clean-text-to-keywords/README.md new file mode 100644 index 0000000..1f6c72f --- /dev/null +++ b/clean-text-to-keywords/README.md @@ -0,0 +1,189 @@ +# Pokemon Text-to-JSON Pipeline + +This project converts free-form Pokemon description text into: + +1. A normalized keyword list +2. A populated Pokemon JSON object (from a blank/key-only template) + +The pipeline is deterministic and rule-based. 
+ +## Architecture + +### Stage 1: Keyword Extraction + +File: `keyword_extractor.py` + +Input: raw text description + +Core logic: + +- spaCy tokenization and POS tagging +- POS filtering (`NOUN`, `ADJ`, `VERB`) +- stopword and punctuation removal +- lemma-based normalization +- domain synonym normalization (example: `flames -> fire`) +- optional YAKE relevance scoring +- conservative retention policy so detail is not over-pruned + +Output: ordered list of normalized keywords + +### Stage 2: JSON Inference + +File: `json_inference.py` + +Input: keyword list + optional JSON template + +Core logic: + +- infer primary/secondary type +- infer name candidate +- infer attacks, abilities, habitat, personality +- infer basic stats (`hp`, `attack`, `defense`, `speed`) +- fill nested TCG-like template fields (`types`, `attacks`, `weaknesses`, `stage`, `retreat`, etc.) +- preserve already non-empty values in the provided template + +Output: inferred JSON profile + +### Stage 3: Orchestration CLI + +File: `infer_json_usage.py` + +This is the main entrypoint for end-to-end usage. + +Default behavior: + +1. prints extracted keyword list +2. prints inferred JSON + +## Project Structure + +- `keyword_extractor.py`: keyword extraction engine +- `json_inference.py`: keyword-to-JSON inference logic +- `infer_json_usage.py`: end-to-end CLI +- `example_usage.py`: keyword extraction only CLI +- `json_template_example.json`: sample blank/key-only template +- `test_keyword_extractor.py`: extraction tests +- `test_json_inference.py`: inference tests +- `requirements.txt`: Python dependencies + +## Requirements + +- Python 3.13 or lower is recommended for spaCy compatibility +- pip + +Dependencies in `requirements.txt`: + +- `spacy>=3.7.0` +- `yake>=0.4.2` + +## Setup + +1. Create and activate a virtual environment (recommended) + +```bash +python -m venv .venv +source .venv/bin/activate +``` + +2. Install dependencies + +```bash +pip install -r requirements.txt +``` + +3. 
Install spaCy English model + +```bash +python -m spacy download en_core_web_sm +``` + +## How To Run + +### A) Extract keywords only + +```bash +python example_usage.py "furret long slender agile creature with soft fur" +``` + +Output: JSON list of keywords. + +### B) End-to-end: text -> keywords -> JSON + +```bash +python infer_json_usage.py --template json_template_example.json "furret long slender agile creature with soft fur" +``` + +Output order: + +1. keyword list +2. inferred JSON + +### C) End-to-end but JSON only + +```bash +python infer_json_usage.py --json-only --template json_template_example.json "furret long slender agile creature with soft fur" +``` + +### D) Start from keywords directly + +```bash +python infer_json_usage.py --template json_template_example.json --keywords furret normal tail smash tunnel agile cheerful explore endurance +``` + +Tip: If you pass `--keywords`, text extraction is skipped. + +## Template Behavior + +If `--template` is omitted, inference returns a full inferred profile object. + +If `--template` is provided: + +- empty fields are populated from inferred values +- non-empty fields are preserved + +Current sample template supports nested card-like data including: + +- `types` +- `attacks` with `cost`, `name`, `effect`, `damage` +- `weaknesses` with `type`, `value` +- `stage`, `retreat`, `legal` + +## Tests + +Run all tests: + +```bash +python -m unittest -q +``` + +## Troubleshooting + +### 1) spaCy model not found + +Error mentions `en_core_web_sm` not installed. + +Fix: + +```bash +python -m spacy download en_core_web_sm +``` + +### 2) spaCy import/runtime problems on very new Python versions + +Use Python 3.13 or lower and reinstall requirements. + +### 3) `--template` path errors + +Ensure `--template` points to a valid file path, for example: + +```bash +--template json_template_example.json +``` + +If your input is already a keyword list, use `--keywords` instead of putting the list in `--template`. 
+ +## Design Notes + +- deterministic and explainable (no LLM calls) +- domain mappings are easy to extend in `keyword_extractor.py` and `json_inference.py` +- scoring and template fill rules are intentionally simple and stable for game-content generation diff --git a/clean-text-to-keywords/__pycache__/json_inference.cpython-312.pyc b/clean-text-to-keywords/__pycache__/json_inference.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..460302a1c3bdac32c1e1024612c11efba9dfcbb6 GIT binary patch literal 13445 zcmch74R9PscHYeH><_TNz~cXRNq_*BBna_O{1FKfBmoj42ucKxk`PI&C1wFEu|Lqv z03e`64!N8w;7KknZN*yqXd7qoa$uxHRD7 zIOUU4mE?OpJBtMYN_JI}0rvHqp6=J(uV24@?{)uMhog|g)4TZ7(5pK+?myFu{x~&+ zUtQ)o?mbTA`Z=QeY zZkz1#U7R}59UTlx-WOsc!ALCH8V-#Fz0dcZ@AZxc!ec?%J1E5>-r!Y53JfSg(K`~H zoQO%Hyz3Fkh>urofoL?Q1e8!LD&y5E27{vmvC&DNSv7VE?Z#iFWtFg!`r%|lXbEGnwzXiSO(!m4>R z7LxJZGAd6F42K3(YcL!fP$a?zLqo#~Dz!UUN915YHOeCxz(8OeU5Ey$(ZNtKEUKp1 zKrj$hO;Ttajhf_%Kx7nc2j!ropzWAMWzEW1G>WcS#$(}uKs2UWBj}3~2u0iN)Pit1 z22!9Nj;N-&YF}^B!s@BO$iJ!4+WzN3Rwomq)=>3#vsCh$-%L( zY8@C31|`A@g3+Kf6jVY3?Jlj2kb=IWB?T>o0~4w+5&|m)bOQ}s4hOVBVdSc5bSyeB zjL{4;X3LR)Jgk}niV{Et^Pm(APN5Srm^Fe_O8aPVq~gaqRd$D{y>g_)?D#zRUV zEOirZ84d=*%J8Jr!`@^i5D7&CQZL@xtpL(N0*443A#j?&Q3A&Z94BxBK%*=S2P8~r zK#Ig@CSzf6+@!?DhA?(aBgv{+9;R8(mKTXF!XOwYL2aUIZ;pYZ(iw)!6QM!HC#Z(u zQPz-Z5rc!lDCB~D7ovekP_>V0Yt2t@?MA97D3hakAF^*!C=!q+{q&|4GZDH+Es(*m zDAvcM>dc{9F@bd>0(Q#i_edxlhMZ_~2qZShI@8WdnX#lpimX~kv5p|fvRV+h914e& z5YQrdG&m3ng#EcJtq0)A5at?_fDc&psx25D4@t3T1j3mu5)CM0m{!|pP{LXZfa0WT zQ-Tq^W56iJnis-YCaOgqi_jod8&=PN6dENxqS^+-f$^B+SD<%PtFCfLYKLUji1aFr z)2vB?>d;a@7I##UHO3nQ%82wDl`>)qH1-?iQPJm+UPrF<27w@fVFFI5q}Z!>WjoQVRv5OsNHf_$Cj4C%Q~2&|SzH1OK4shG|8cV^|zRNJqz{QLGp9 zWH20#O-L2g#~K2(g-ay_stJ&Cm8jnm=^m+yKqY}P0;KUI8v)W9QW1et0&W5xfB_f+ zxyJ+yV#}}6nuB@3U&;TPEhZYi%f-1XB;HRvA7tN8aXR)vwtgSyGxmPPd)Ku_6!ubU zw3^RyJX(x%uVHA@e0_F#RgS45Evz85}FG7460$* zGnk@)q{s$P_je{?@>i@Mzl=tnT3GBsQ0}fiRSQE6HT55^L3^vh50l 
z+#jM%l(w;5qmy419D%Bac9Ytt>ikLwO4~mWkb}F9gk#Y5a{I2__xI7LOw;mTxnIq2 zi=N~2jfqq5oxgGZLCLn1=lGX0EtT(7)$r)5a0TKGIT*!)y3Io+vi+Gj=aqg3P_^Z? zCCwq{5Ac&~0cN<2&3XOcwS%B}XWLJ9e7xfiTbDe?)3)Oa!tqrUSU+#JU_g_y;j53-o#!CaGEJn`srAsR+nIIfRdM}<^7dlYL9V@4^xDI}=u85DA@ zI2965_QW||YDA+(ds6V3dM~P+M1AV=S=GXj9ExIpz_eg1S4h8O_Zn4&FsXGxB*Vm{ zjL`xz%0WdnhJ#V*_fT2RV%ts6 zlBG7|D!KmFwYT1$o-?FeTNkWbwNH(=gj>N6`xYv9FIo01yNVP0-=0doc{t+Q!fAu(dl6`I>89-YQ1?lmZaSHZn$!?ZEZ$BpCQ z#5=aQVE{&dA182ezK@fB3)Qj3!AQa8E)geG+u6=n{2e{r9etgBstF1zBBO*6OGLG> zV$hNWWCKM*SOD0N!Xc~&nMt=7k0!rPtdeXIKdMT91k!i#lWFe+qn$<9L)SuYk34YJ zr=0a^=a!k%%XPj7b#1A-wshU@$DF}bo3d8T797aeogXOPAPJ7yk$9c$jQky%V|bVPQF>r zGswxwIm7j5C6Xbgb9@H2Z>knuKhk2ho z-^EX+y%w|Usea74OpUV^%-y?@l&v9SF9Et#)|9d}Y4}sJm&&%MY}@tXJ5#ouz?W^# z*@?H0B)2Tus_E5Ll{}DkH7r^iR_f8rikGvOevK?7(W&>iA=+a|9@=P4#0~4@p)_uY z^W(f!gk1>o^W+mZ=<~>PN=`Gbn_buzSYK^C7v*Vf$xopq=24;*Nb^lxWxKu4DE%>T z80w>=Fq+v!AdhSp=r)*(vh=6GW$IyWrP;K3&_4ehUw%PFz6F4ZD6gF9WV0cdiq~di z-g0Lw)p8_bFS_1yttVN&WZyz89(((6vgqw&pV^C63Q+!wLe5^4u{y5Xuh|nXFEPzF z)3cG!ISZl(ZE9=oIzM0-;sy+)gJ@^Z=QX&5wRgktj!Pij#K#RXcBHsLG>{F`hgv*- z)BFVSMzxG-%Yf|zK3@8lD5JAPAm>%JX02CwJzE?aTe2UeKSR9?eln~NV$2rbFI`>9 zlKG2E`#Tp~JAXr1aCOa`0r<@3$+#*qHcz7C+H(s+*_V$koclB{)2jb9#}?r?g)PE5 z)lRFGsT@c^zL;&#E3?=3nlg>I%Y{w zHZF|+Z;~M`p!?v$&ycL+LYOi(a>1p^M9v+Q(kS;=_{k>#z=!JEneHqfnz9nnv}kLZ zKXBK#xZ`BTUi!dZld{(&U%GGKMyzNeR+J_#Bm%QX7laCp6(pPGa!J4hAM-hf#HOjOYX)XCSw$}EemLNM@Z^5@43EgH?7dAZkoFhGW#}R}t$vuF zSzR>4j3Md!C;;z49)OqZhYhQ2;yrwcGI9mLuQd0e=w#A@71*-acH-SFsiKo#G7a<( z3rhH3x0LXYT2S)Q904*ue1h~(k-+jDi>Ss(;3_~U>NCNSg>9DIK&Wqs8$<#MLQJy6 zv?1wk>?Gdfy4SJRN9ix{?Qigt4*~pYhRd3{GiR2aMd_lGa|f4-PNtnFXHI{Py+e9; zZ$eI3fEXCn0-;l8Z%FnAyi>KUw8!efRc~xi2}Zqx6q59YqTVUbE^oI&gb;ZR zPkP5-6MLtsyoir^5rvxo2|Xa=WpUZB_+8k~Zh{9six$SX4E%`*1|qU7W~W6@>>3F~ zk>Iz=B=+d+*SVL&;Z`Jk1M-&(HKcw7F)R66Qoa5nX8PxYvBIa)sm1@->;7=H$_ZmJ|1m z+}pMA!iy_LzMzj^F&a(gjk=F)8aBx!sALp~{0Q~~iJvq8H)e4GxG9U9CoMi}?;{cl zw%I@GC+wF1kGcr^*93k+-~j+wcLqtmb64S)f{GLjDEHRz_9OBY6s7-;k`}C2 
zxd8~&o5h|mekY!Bxd}6T=dJaa&8`iHhM+&R`?KMYB|}k?HQ!fc1Db+V>v{$n2z5yV zk)4SR+@xzHXf9G4>=8|C+79w*L@CtL%IcK#6!L26eN=~fj5X4KBaAWR-xH<+K>hy* z!v7=TLQZZ-+rW!MkoKxXd%kK7h*!oCXBVGv*y{aRNfQH+z)(M!CB+oTD$<9G4?X!SuHpjLl`m)R}{iAhuTO2Nb+T*)UO4u zncLz=aGyn5rT>9q6*M+dz$74^#MMPhea2O_Y%5E|)3(M1p>d;B&QW59i z0dCFj=Vso7nDhH|ttDZf6M3?KX`viJN>cT=qLte{*U@6OG-w$ZE$oY+)%%V8%4yLY zt$pa1k@_`lSle?qj|7Uj+G{?uF29m(ZI{m1>R2662CR=z1?Z#ljjafJ-`4!_Tr1xK zHG%cks9+1%o_MZbP&JSwxp>V#7tP;=#jw%Bkl@ve7NRu7Ph%h1bQZ08Po$E#a3vSZ z(%Z#c@}iY!=;e4WP6|(8<1Jw0t^dY3DbQ={sbmrhL8X9Eu`w#GPni*$UPrWpw^a(o zg4v^ZbPRoT48{=JCqDDW^ATIaAyj zw`=20%XFdg%iQR2yOzvP}h#xexGs;SNAX*97p~*;G^TXAP!mwl&RuV zII5Ptnt9}%2r0u}>2s=K*0R{aA`a$Zd#>{_|8KBW`W5y1OMvXrR-Rw^cVOnF^~`+c zjZgRexc*+rgWX-JXWsZy`k$chbF&FljktIq-J({1NZ>~VC@!oT1K}`>F)*td*E?|1 z>7c;0|E123v%Q^tebPLY!>RvR1c$^pY@4IXWIi%G%jd?4=a@1&hSOkn(niOw*5Og@ zL7(tpx8J##-hOzg>N!MklBaI8 z&svvT_GIge>AfdE>04^)epgN&e0TcOtDvA5ir5OG}NHGIdRJ&_V$fa7HT!HTNZ zH9aUlnX*>Y@OTxqeDM9G=C}Guy4t;KhwB7 z(V1+zac=F)OX-IFOP*&l+kE)ae5053deKsKb1FL-ucWsh`=sQ4eOIQkey$)@xqa4| zscx9-PF3$(uvTO$ymMu#ik4Z&@|NxM?ddItSGl5s3w+96J9{!wv+Su&hCe*D0s(nE%n5u0OQoI-J>Y^zO*r*FN2{;OjxDms9ShhpzJEk(8@> z{?!MqM^mjw?+&M1Pk##R6tg~x4yBw zv*TWCdgluZJ6=rm%$21}TE1u?UaS;w-qZYorD|iCf$Ui3fmK1dKXLZ3PYdEA^|Hv)V~Uo_&z>%HZ*nm&l$q&u3atC?fh~t|`whSfgNGvk3Ai zSn|#h*UDPgY`yMv?DM!4Hfdqp7Pn(m<+++UYFeoevT;Vy zdegD)B&ncRiY?Ph-SL;@`@YgPN$1^&YbZ3ol#Lo=N?hA1k{xJwg( z8`k9}-~6`suiV-F2hrrVIsSKA?lxvTRhiN{g5_H?PIsobg7BiUOiA@}xo^Jlj_FR| zW9!}Kdxt(9N|#^yqS|RUuW(M2`OC+(oC}A5xBp@q+}N%mOxfqDeR*g@p_6&8%7!7LtL|2Qy@ktfNXmRCK#{el!)E|I5Jrj z2i?A`J19)&E0)}1gIjHm8*u2&zf+c{VoZ7upcc0xK*JU*E!!iY{LT&m?IP&LPfc~f zhW7^O5^*bwcI6fG*M4?}7ItttFuiuKNLe_hDAcg!D5yY?aKS-b6)IDhc{ zXu5X)OxJ(16~Qo?b$u>WC%fj(ED5a->l%MQ{M+I0M;~)0(=NK{Vw`<#xn%3y-gHUx zY(d6Wl01{LHL}YORmr3GZN5y^*11Ees#aJ}d%t&f#fEAtPMpNfoMYQ!FV29-fnoMK zU3bO3%_w`(Kaj9Vjp#02Z;XYJO>EHYd&<@elkWM-pg7%^uDrj-q~@!l>=pOCd?Kd` zy|HKzoLn0%Yo(b+W|p=PBU%V-_Pt59`2AvR!0%TJ{riBTz%le?-{7B2YlDbA-_v(N@bn 
z!hTNR&j`@^+FX)xFCFbz0`ElBsW9yLC0XvmplE@sPudGKRF>x#$Y?pT7EB-n8 zJ|GT_c>Xi)=wEVM|B|cxbMDv!?$~3&Q)HNNJl^gsF)VQI$6iMT(vrt@ZkF9>)3R$g zTMw@~FR89B)zR@qfyrr@DO_=L&XUCI3-(6UzE3S{S6v6yqHfi-SM@xndQN4Es*{%% z+>L7C4lKZA>4LL97I7<^_3yxZ7d6!|v_Sjodg;vW}_MtNw!9IIs#R%-7 zP&j*f#RLp{i^)FQzhWWWO8BLf0>TS9lPh6eu@P?POog-0t~dyHQhvdTi*Pq*a?B2| z6cJub_|S@n@DjqWtdtU7#+jV6<16KaS7>x939sTzHuRyI@EWb%TEgozI`xEmsm${$ zTL|B(m1!Wnk?@zk*am#YvAUDzJNQ+rg)dqyG4Y;NcM)H@+ERem3V||^C+fF>>8~QyOHZ*xn1J!N!@6rxCbSBS;;%@~iSz zVUKgMalIXoA&CNz-(JS4E$^Jkr7>#)S z5z!lo%3*&f827W1@u=)QAH61pQ5*21^@J3WWLDSxF^$;4t2Tcm5{kUtnvt@(zr zxoM0s9+agHGbjo9!@$8_iYkoF$mmEZ4Q&_;`Qs8JR|cJvk{@&hBjb`BiK>QKKPc`n zslu%PIvPa-l0TyA70Is}mV0dLxHN=7@>E?0DP&Xq6oQVV`RpDANa1B(&(z(5v80h>WnY;q9Fl0q~AVTnFN2+E+-;FrTupu@fuu_gVXP!v-j%Kix~rJx!B(G1Sc$zh<9 z3>oczBF**MBK$#Lmo+Y zclTA*qjm2a8ans%K)>(&xe?Vqjpf9KoFV=~JN_Ej#^>NN`9akht<&IXjEYWY1};B+ z?n3{_M4qXdj~4*$_jOn_AnV&{X5^{oK_{9Crm=e_l7QPw1x5d*h?EohnG| zwe?!vypRwQd_o{ulh6Yzr2V`hVfZKGGbW4*f2)OIH4@faodS33X@N`dBM=kuz7sRS zkmyzBBGJfP*xPjs0tFy#DQ`G00=y({&b9CE*dT>M(df0A>27QQLog zQjZwj-{qeQP4`UAfNxCC;k@<)z~&=eF-X;}ShTAkWR;!*;44VmsN6k0r`Gxyk+I@1 zOo$Ts^0zVCPW+UwBDu-k-|D@2I@`D_(|91&cp%+)=mBRmoa9sH`UT74p=@LOw|?WL z-$<{iPnsLD?MD|(Nq0-i+?uU!e8A}~C;5fb*{1e4w!FIK^~OxofmG9hbkm`veXHtf z%66YvG$m^~SKK>3F;UZ;iAKmdIlJfP@Iz4XOE&J4sCki#6}RkEK_Q_Y+9T4Ye#B>Lwe=kN2=ENV$f0| z!MNnpW_o%O@>h~+zaPd=sYP;=yJvIUKJxsL#nAG=8)sfU^ZMDeZBJ6zBmV(v1Napm zrUui&JdR;r!?jEL#~|V?4u6RgIop6IY(VAS4lasYSb-O zqppVKn#5YDkY!T2SSQv)*{qi;B&%eXoMP3uMN=9d{T8>04bK@gMgmo~S=<7gRbq>{ z74K@XRcyq&M%*rX@vfb#?`TuqXY%^yKwjO9%JJTQ2?qu;8q7Z!@ItDbob&GRUhX?H zfR|j-RJ9YDS4pRX(^3dZQ@3oy*QxqQXnNhk(K0ZO>I8s|i8TL=aBuR%9lZQCWZ&f2 zinGo0x%nt`!!gl+EMB{j$ZoCb1UfMmaFbgd3Ej+h$5wQS(zt&n6!VQkhmOi~pVt@E z*}P2BbltpglYb9b{;|Z$chMN+-o{IQhZ5SOjAHpafF42YNmOgZ_FgCu`?*8Ysu?j* z+o3efN}_K%s;r|4lb4@BFMzctJ61f|+Fl>w@>J?b9gfK_pu?|8YCpOtBpPpWf2ypW zd?wv{r9iLz4V0?x-~^T*R2Gi)dQu;;*4p&_?ypbN0MSJ39&0`DWo`OA>lU^k@@<$r 
z->gKM6InZ?2dLI-k~HmOMiMkc=v{W~@gn`m`EL|+7h#D+qp)~p6v;OSoo$_RUc7TmPB7HCs=v{gxPzG7{RzJopJL-;m4Yx)%VGg{J$YLB^15&6Zfz5Qhg2P(MQevqD>S(Do~kav8l;T@igdU20ez5Y zYFwuj5wqX$XuV?fT17&yef4lRhggf*!mf8QDa{VG zTDd`n=T#Q@`#>Xq4~gpbtvPQ#IxT-pb+1tJGBqgkZQSbezo*J?Ql(*LdRmg@|3Gj# zo!G*r^X12Z$`2DVG_INo$3*3?sr+?CKzmYI@mllLo^R|25=H>oO%f3|0zO;8g6>Z55p3Mw?B{le!ziINl zJF(pMM%SxdulHoO9ZYRI_|dk*D@{IXwl#b5bmrtp>f}h~vp559i*ZRu{`Ta&5g1LDlb4L^2-J_Yb1ydunB#m!q2*3C5C1g~w0d7F+) za0y#NcY{}qJQpisP3RK>%3LUO&=B;*id#nOpD-kh2~)zHu)J-2tFXyOxMnU!Yr>uu zPOhA{$BM^J)F=3tIZ^n%;AVo~*2+kTe2p})+{ z@*Sr5-a#xlOn+!cnzsjf_G~aV>Gc=H?m%QB6jUa?CE-dOWH|1~OXYA>ltSLN`0+Mx zP=RtD^TOJNH5!t{ZtsPe$Pq7;-%0q0y26AoFz1C08v!?jduiR`#INTO2FQwMh{hx@ zwSz+{81cp?f&P3Rl@z&7rY>3U-QJ7xoL0WKd-q;1=x4hBPIQqX8a0KFAvf0gVY@pl z^1sCgv9rp~WB&B91NLWB@jfL#pyUH2(J`np=?cac1zdFmf`PuR`;Jx?;fUt>ZWXMEB#k)`P5@|TXUS%V*Nj#R&(y^2b^H3 zzE@tGEpJ@5E+ujtzq1!fS?@!=-s8*}xr*wXN$;rs#iwS@U0Z0qHK*gtM)-|Qa(dwU zTxkZqU#;xpx%YWX-$DKRR(D^W{{1>T!Q1V9d-U(`F#rw}r5@TBW#=jm*o!biWj7=i zDGVDPO)e<932p>O{#URCv?8+`2J??01z|)_6%TX?u>v!As(4`6l-Hnq6Df>Qi|nP; z`Z_Q3xAvjOCem0wxOutAUpt8X)4qA_BPI4D<|fhxeC`ru8uU-aM~((K3tj`0g9<}cz^i!3kG7UnXP3Lf2r?Fn1yY<%shjlW>irRy=%JU}X?C36N3+8zXShVRn zhncn|OjDca4GD7zZ^F1KMPTylM&HFow{9}JZH@L})bL_2^%v-|GslhtRB=;Vv@b5b z6ArD$WqXpCHW(g6Im)z`SeTiJvRs{JQRzjcHTJvlG>{0R^&a1g$&a{&jI;ZOe#?tG`_s z+kvqy^9TvemnR$vH|C`xQNHOnyAuvtN>`Vo#!6!xTXvY~F)) zYtiaT@2R#DtU(FOREGwa-r7DD^_ov)z5 z1&dfqZfZvfy9UL&G$>J~(XYMdtAM{pt1rFRdpWSD%0y$Ls*anlPE>E&-n7S2=OGiU z1^~ne^heqbHiC2njqBjF(htiddp;%>$_(H)c{u9HRY@gz^*e90cYZ-e7d(RSG(_PF zRgM*14)3C?LdW(dw;a3E_v-BO1`{~z+md{IbhY2;}%)i8S|ufmvuiVuOCX@po5ABCk5&>`zZ{vkT5#*h>a z`+dv^QUzKP`9oF?cQ67KOrJtPMMNA*ip)TVLLd(2kLC|5`udp8rdDXqRc1*06tW2v zM3jON`9Bg?{RE7zza*#<_Ck1iCMNk}ei^06QtU+^VxgX>9P6l&e}snee`Wo|Xh5|i zSXQ*!DHaBABW!ng7*#uzR|F{G$d18>Y(^+A0_%TP{wvlQU#mK2B0&T+eNbiPAZT*2 zHpS0C#0&mS>4xf#%E5^sBB0txYI&(qv5fbrHgXkcUudN5j*;E$i+%8HiwHo*Fpixy z!&Lf12)tQKW~1t)ARwG*VjhWVUpEo>1`6^?qC&gAsCjMx@`@-TJuH44ulZ#=I}RMm zFR&|bazAr(BIGPT_yg<*d8r{@z31JNliY?+yHL=>4JO#VafQ 
zR~Glbc=YQ>GoIFzr#0E@v|!FQ zw37zA>-nLD(~BcnSJhH`%C!xoW$hKWhn^o=s$R7>W$iT?dt=JpxLkdwdD;1qy(ecu zAGp>Zdwjupuchtvfd$vS?t|~z)7=9PID_R&e9GRrFp%&#HZI)?SyfH>K=N$>zPu{ii?ld?+QiUHr&?DQ6?R zpEx=9*8J4FJT$r4C%>HWTu$vcb!T_7`}liJ?>)6ru|4g%oU~uYdQ{hEs(MmYJ$Hq4 z)&2$BPh546Xn!B8XUrZSe!$r*r||_xllF$>)?Ig-@6Kd*b|aH@H7-Ava&>3j-i*5~ z{{a8W-ehCXYW=PcTNZ}0wOcZ^ zyHd5gGPV0swfpXleDCr*m;dOQ_nOnSrxu>fZf#l~N@3(DQ(KNL3}@}-x6eL*cInui z{mDK3$-04$?5A?oL}RXwt7*to?@m?k{%-T#@ju#;u0DnhXm;MVJ#S0Z99pognh$3^ zwM$(~N0Xj*EVtQx+wr_3RoA`XST*7qt!ezc#Z)KubSpsVccV62GKn3Ddf^yjGrh-dDE1yAi2q9ZM92Jt=MohDt=y8v3N04xjj|6{cYh7EpJ-xcE7jphrRFj zruJM&w~wUDE@GG4Tp3$K%GR*-?265Mud-&TYPllQv?tZHCtbOB;nY1>`Qm}4V`*1s z(%hMkEzt*SPti90CQ1uVDK3P7ZfPB1s}?=CTDOvSRa>=r{TerrE@snOLSxj60$bn0 zxZ3Q`3lw!2C;#kn=gttv^3HeeM4&KDayGD06B4BMw7@>*pEm$zHsA5Im zZ^?E-3oo=iD{L(2CL5oGwfIH0gzYztuu5##Ksl_y2Ee8SiIV^*LQR-hGM?_j#WGyjfbje*z0bY;st zFvF!-+*@!d8tCd=KzAAQqqzWob&GIO)8IDG)xZ~3WilZG`@-ajNR|;aH#_6A@WBkk^eU#fNw1njLBvw$Lxg5|AtE4 z1x-RzbQBi5mH#Kz3Ghzha!fw%D_hVJ0e&Tb+a?lokaxIB3Y%O-)oxNJ^HJe~dIzO# z+83c4C`@62H_wre5o?N+ooRJi&X}(Rv7|`K4~Y!3bc2GGkqNnu#;{OehgAbaR41=; z(uvtjRg(V)D&nnQ_G-Ax8XA{k--^E!e>st^-)Z*sx1}?$%bZ;}wK#CEcK6-w>Dt4%H{jT{Xvo&L zW$JgN>UZ23NZ0RKG-o~aNl)|g%$*zQwu9Nup2eZ%%9V;XTpe)iBCp-9Pwbq#foU2) zFRNS(WvV(-RUPT79qF>23&wjD)l0?~&Ma7pz|U>2+ee>2x@1{CylU&p?mYB2p2N%g z7YCO1e*MW6&*7x~aL&p#b!EJJQrP@i!(Vr=O{b3A1~4tFBCCiUc+e4IHOc}$C^^0x+adO*MUwET;O|Mx))S8?RChemW&;3nd} zD)f7~?{Dwl&Hedy>wr%9z*se4GJfE-^>^z(=;A5eZR_vVe{hUPx=x}KA7v7K6aY!| zNBJKq^0Zl0Z?k(7ki9yM`)Dy(eLj}-7Z{42nS$8Nn;{NZiFDf&@gm%qSuZmGJ9=#5 zJN}}XMJG|&js)_urtw>jmmDv<793fZC*$f!xjHhgu9U0mPWaucY1f&g`3#eLxcOz2 zC0uocwKR+id>s{X78npOj>*g=9&gpIeLk};z`8ziel_p3|99xhJooYPVlV9$*VTNu z>`$o)Qy@sZ=l%Nc5%dE}81w-_e@;m|PFTFWZR3$_>kydfCwHQrHTv9AyrPCW8x=ED zgn&U`=pp|O20$rUfh9Z)>ZnuAJ{A=4`BbaV2k$*DI{>!(e7O1;qHqCnCZ7+MCjghv zx8{ni&nH8|Y#Pc?%e9MaX9fn&`!3McIW`w+g;sHPaJYE8Ew6Z$s`|cMs$#RnChD&- zm}+D9d2s`AGAib8B(S);N`xRd#>u32F$O2ocG`FefcQIO@{a>hI?&-?az7IW?_2Eu 
z;)$Q^Kax5VTHPN`3YE#~J!v8Qlcw#dgBMntMsA*2+>#bXSpLdt)77LKP_}1pcE_=7LwB}e7yZ|6$yIJQSnnfHVZa+L8~Sh^ia;}0)ts{s zWaa9+ayEkOTvbQTL6DQHtjoCwa&y&nxiW&v3GxtB!Fj3~;#?JHZFo?PvWI%2L`Tqh z{seD0`0xpx$*|*Ld5gjN=@73o4DtC}A2fmMSPLn=jI~?@K&%B3$qGayk;^}ug)uaB z1FS~3DO)L;15ncW_=%HI+@XsE=_aTX|K4WW6*$|XY+d7g$N1piz`Ka4>X+(C`oux~gblnu^^Iub4@tn&$XiRqV z#<+$B5Vi5TBFovxFRn7=tZ&Cwxd)BuSdY)p4R0J*S}Ufr$aes#w&MMGNs$Grqw@@N zW{$4Gvs+Cv>HadA2{MIwWh*7n{5Y9>da{EOCnaB_WEatVo1hmcVJCApmF=NqFD2im z3R?dBa?_WbIX%Gpf_26Ibj|<}r^aAkaSY~6lrgi+nVf|(R?gs9ah}fE zC}Zagt`+xS&OsR`t2vf)QO3<`p30R`rkvIE<~)?C;0(4E`>|XlWvV!XbHz2BtENm1 zXRxl=PUdPUQ^zuo@fG0{fXrzVuJ#r%IQEm|x9 literal 0 HcmV?d00001 diff --git a/clean-text-to-keywords/__pycache__/test_json_inference.cpython-312.pyc b/clean-text-to-keywords/__pycache__/test_json_inference.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5b4979e53fcc20a6ae7d58946c32d5dae2406691 GIT binary patch literal 5126 zcmb_gO>7&-6<%_ed!75n~wEsDR$g8u&X+@i$*mP6);q!+ckGct+ZUS zvrAhPWWWVn)Hx+L%YY6<^b~SYT+LL-EAe928Kq~wS z-{(_7p3E4gsX8=QFg1s&8J5qfE3|Yc&vct74J$*LT3~t5+S8f#U1J%nziiJy~bm8J{ScI^3dpzat8 zTsHjV?Q78bP|Cux{V{Mi;OywE2cFdHcZ2KFa(#Kw)PL54dN2Vmy8NU*<$!dcrXn;$ zO`87J`3})n7`#SM_rGeALun33knoxGXh5of7r>6CFLZEAOT$SzQ-L*e9Ej)$}?oC_Y#pR`5c|l^Rr{xvRRl|E*fcCD1nQKv`IB<%%Q8!n3Es#T}(Le z$?7tkSye;{9@_Cji4Tc(KV){g)LWWur0uoI{il5Voff#0b{WJy>DypuU;6{{c2GJx zURiq9d;US>CFy)Y;x37A^p|6gMk;5jWVY72?aSyF(XWhheK#G;y+{Bs6HLp>aE|=e{47n%HSIGr2!jMab zHZtYU-Qjb8{;Wzq6paHe8Q44rgXG+inPX^{{)%}CVL)O)1)`Y#s~nAgY!IcBfmm(0Awe@Bn2cRod;Mz$OHzI$smtH zmI;LA5)>YmUKYK&meaB{74)DrbXzlv6p)alfwN@utmNqeqnCIH(bg8|gtQCH1eo#m zJ5*b-C~zQPN&uh8cL2FC6Eql>N30Ne875=ai$Y*J0v)@8LeQ~~p~gc$L5{+3SaubK z0RC{6<$(gSIaCR*PJ6>g7Cc_~3K=(7=71cEujJ%Poj|f{(31)^u#~rZc)7relt&C( zH?oGq263Q^AohUcU;`YTbXK*U65K#`&~f6O=(yhi=VC9zTmc^A#}OWT`)aZHR|^P@ z$NFCm3~uz5+197-kqb4y{?{q@$d%V2Io|b}NGDHi^}5NKzYomT`r~gBQ30L9L_lZX zyP$Kxqgr*A7Ar@9rCqF?t&;P*0To+Mc_8~`_ZQt8;SJ|e=ZpARH-7e6e7s5~b}M(f 
ziLr|2CMLIHZsNkz(^WFpy1U;^eE1X)_|ooffxuJfBI71zw@OuV`5oQ2tK_4DyPvji ztyjrUg@WLN;d03xo!XiLKDZyAG5ZO4{y9|se-}6iMrA$~;h|e5Tqd{-bb^PnhGse5 z*%p_JMhuAfjQrACvGR8;2tk~cm+Ob{kl+M&GW4W4gm{ZlGn#4M0{^O$SBsvp3ilp0 zi+3F(4b3Y+bJpe#I+*{1lRVBN1p+C*f5dhue`7a1%Aa~pX7;Z&CI_+B94cNaM1HL+ zn~F$oq3@O4JpO!`ge5M-`VM=4N3tOTT$?ZCEmE7_20*;$N(?*SG0)rQif|px&BHE< z4PWcHYyukUrBRzY0KPjM)?>frF|4dAfZbl3I_OXofETtjxeH_3t00ihjn?{x1&cfVrY$6Zr&H>ien9@! zbM`>SHt1i|#O2UqYL$jwqF#UtF1A$m;`ce37AFe+KLpIxDKcvw{y{@jRUBfK zx2tL{uNMK3RoO(leQ7(`ANpW> z`n}K(w{HYaLUlgS8Jc}N*cCeGS)wjZA*UIZs^WBtmf_&S^RDHlTe!Dj7}lIDK>wTu zYb?h69NQH1z<%xN~}q}OsF5cr4G{f+e5+whfO;M_+1MdA!l n*tZhCy}qGHO6P;MujNEd>A83951;(*li&a1wJa$E!uNjw?*5c| literal 0 HcmV?d00001 diff --git a/clean-text-to-keywords/__pycache__/test_keyword_extractor.cpython-312.pyc b/clean-text-to-keywords/__pycache__/test_keyword_extractor.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2f524979db397b00221fbf642126a8fb3377b073 GIT binary patch literal 7374 zcmd5>YiwIbcAonnc}bDBs0T$oY+YGTI+5&+?R|J<(^)^VbvDYwPSRC3Woh0ki85b1 z_tG|1IqPj5tO6vBf<;$KTVS647TcgOLDA1qQvczQe7&Da1%uuh&uJ)uN>O83P*6UPJ=R=)l%2$=DrbpEJQ5!JX zY_R&ahN7>WLR1wSBpA|^S_kDI0Qd2SrPVDn_k z&hu1NQxszL;MOrX)@ZUx$zYSRA(@IH@5-HB-_%Rs6^V>lu(GHZRSy1UUDZ zmmx`YNr6-~EMxO3q^j{4v;8$KL%NJnjQf=RPQKjR>h)WNw`v9tP)VwTaEO{OV+>t3?}diIv-;3+gNWa$AQeBHbvO z${jmV1n>0pVxrgVTfO5qd&aLv#~0LBL+j|qVqSVdE}1nG2`7+9Wb;NbLwqoi_-+xK z;kGE2%^mejKA+)H^t!@$Ckin^PV*In)^6XdA#!j)ik$48c#_|{VssruPQM;e9X>L?HL zk`m(~XkL*WpPigeWRnHQPwq!vZ^abefkr++WDvxnQEM9{3^LbTok9H=HIp6@?2Nk! z<<4C&`!;vUWUEwdncx{k5i!~S1GB`Y*+u!m9(0VoFa$AfsBB(7#bU~YqfR{e>_kl6$~_P`Jys!;wo%wTT|mIhpEtSVO%@7f&Tvm7?UbWT zo3_|4xTV-X;hSEYBCh2qJbB&;2q7V&uTP9)8WyQxbMe$25b+?BNHlJ;TZ2HytaBjP zub zW!ckT(L$cc@~I7m!qq3O6n^!mcHq|cqUr~Y9x55~A4v;rNHm|%T{hlNtpIwkF01VVLWWcJY#}u5OV!`CXN=_JO0O_LP%`uOAXbxa}M@{l+ z%kiE)FG_zRk;)`3E0L(j>Pv1YB#Y1W-EVRA*dSuaF6bhg+|fcW`&PB+^1yX1wlZ`> ziN2=I4|f)M66Q438fYb zCsoJ_9aWT+T)Zgh==xB4QEDg&zxja<{Z9Ml!%G&kK8=rXB;9=)$8wf%6b(4Y58)h! 
z3JOFB>Jv41BY3~y1A=cdf~W}zew(o!e6wf`3)&*;Tg3=%f^QczLalgDhmfK|w^PtA zLAynLkKlI*ey13tSJdc&_KEskg7%Bn0l^OnIwW+4iT@MnqBO&dkvFA_@;zrnqLtr` z)$>3D7`5w}2q(5N`@u}~7`xwnsG(mi%m5?C9-eSKQ)!cz9OcntCmpZBlhgSeA4V_4 ze%nr_&N}LI$DjFz;|H|MDq> zOeQ&J5=1i`&(Wh#PMmc7Mlv^T@_f;9G&5tSkZDqmCkxqhJVlF!^4T-RoPo^av1amk zxfkguk9zM^kpr_kYKA1Al|fd5mTKjv0P)f2Xj2*ULK@MJ5S3AK)NCGe4NL=}np4G0 zrsQ~YNzU`<9j{%?<;)De50dE&UX)E*Gme%o6!JMUXFJ|ASU85!QZuH>(G_dY8AYDV zrOd-kWEm;x$mtw@wj<-aWrFIIw2ky<&*1g;OcE142dnbOsR?Vsnx|)YKco{t;Yu!3 za5Su1I6E?O9~$^~iO}Ydt*gKq0aZz5towi@Ct#WOvxTD+Hs|y$9cS#G#u$dSeh!bh z#42ikYoHu#U3|8pfO|jKwm4PsfU9^}s2#kIc#Vak%k33E@d3BKiTEH3MHY*d5b@hs zsAJh%*-m^j3$-pDuY`$jVWGCg=PIqlx3N&m;#Vr|#7EpQJBW``Ur(iz_%2t!oA@4= z-$DFNde^y1FY&rt-$(o|nvZ$6pZEcHZiB=RxpNpMe#D(;jQHJd{X@h*O!N72Wt8|a zcMf}q-%C0tDqkZ05trXb{C*bdTpp|(ApXnZy_K&J|5f+BUnBmYtN(T4$K81zBL1*z zpE&VHB)7w(5=q~HM9V)aq+^hx%aOasNm}rIcv1?s2NwJrU97chW%$~+Z?(i1lvjLr z0A)`7+4EPPf3J8edVE27)kQ1ygpMs137wA7I)E&qqk(9$btc;i;m%7n>CK2D!ZXl#I+05Qdk{i!boCn~@OB9Vd5u52=$Lk3P}3mJ26Y-`uJpT3Dn zSPz3x+C8wKmc#8!J73?q79L&=55J?{437enYQZ%vN(duzqwnNx?QxMLzXi9`AwntR)5>?EDeA=d6!bVmpm(P{5iCms%lbQ5f(oW}d zb9*0P)h&AvnyTCNk9GO0T9XU1Td5_Gz-?Gek}ixsO?! 
zg;}%d3#O5^k6qUWWzsGYd;T zNA`P)2%XahTQZkVBblaig`)jHOE&1)jg~z4IV^eXws!niElC$lBG@hqC`!Q=Fi|!X z083m>f_e=fsAl*<14a{Q&L5KmG%@!*7eMNSAN4AHtRFX>V5ANW zq$w9f>VzTT>E}%S9)i?e0I9q9Q78DQ3-qYhz@zSBN4*9e^*VCo&%+?oHKT|#J^cr7 z&~qiu&rsWMxBc|7U(Ee^oKya(w*5t1+=@@XxAUd{eEPk=it{A3{nyhZ|4p3Jc+-Kk zHw%XXe;?;B-0Sy`aZcHPI`*B{{`RAhpT+sNNqP%W2IwfhTt)9W>fPeP zgvaP3#N+}kI0`-x9L;U`wOsAmfzG}dL;wwb5n}zap?WVhAVRaavY<&>3B#m)MxajT zrYQf3+;{K+;rcWBQcAF z2Ow{UhM#x<^0vj(l_uf=$f-U=JOFugxvjFDcmc?dRl>vrkau9rR^kE3Ba3#Wop=Cp zs_!5kfV^{gpwdY^06C4>O*{ZO&0z=e0OZ8?5)VM$vbev}M?3&I)%Oz*K#no*4uW53 z+8AN&qvhtdCGB;s+}>I4=qh)1)BkAyM#x7X7jtXrUJLJ94ewgne=|I?;C-LaRqv|S z`$-w^m1r%X2p9;8?H&SD?_(%xl?VMTAsS44b~)R-qwai%;t~4 z>aoD4D-)+l+&E=TdnRx2A{~wwiF}{P?-F^5$ZJG4-SZuxIwGU^S#%(Acj9UOP~Pu3 zA+0OS8?Lx)Mp8WAlGX!?=ZK(>3VK4?2*f-i?%Ze?`g?)Etch3vpN9CG37tbHTy=!n zWm2AtFGTk~wh#S;FF4w1C?8D%EYnzGmH7Sz1l~05PqcCOPZ5XtkD!P{$Sm^WiXutU zzp(B9&c;6WdZj~m8OX;iEmGtz1KHT^m4>eO?%6=~&_+v9l17%lxz;nf+B3SrNU9VR erUn=0ZYZ5)HGJutKX~H%PrURbPPW=5miYe|WB*|Q literal 0 HcmV?d00001 diff --git a/clean-text-to-keywords/example_usage.py b/clean-text-to-keywords/example_usage.py new file mode 100644 index 0000000..61b274b --- /dev/null +++ b/clean-text-to-keywords/example_usage.py @@ -0,0 +1,36 @@ +import argparse +import json +from typing import Sequence + +from keyword_extractor import KeywordExtractor + + +def _build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + description="Extract normalized keywords from cleaned text.", + ) + parser.add_argument( + "text", + nargs="+", + help="Input text to process. 
Pass as one quoted string or multiple words.", + ) + parser.add_argument( + "--model", + default="en_core_web_sm", + help="spaCy model name (default: en_core_web_sm).", + ) + return parser + + +def main(argv: Sequence[str] | None = None) -> None: + parser = _build_parser() + args = parser.parse_args(argv) + + text = " ".join(args.text) + extractor = KeywordExtractor.from_default_model(model_name=args.model) + keywords = extractor.extract(text) + print(json.dumps(keywords)) + + +if __name__ == "__main__": + main() diff --git a/clean-text-to-keywords/infer_json_usage.py b/clean-text-to-keywords/infer_json_usage.py new file mode 100644 index 0000000..7b8d553 --- /dev/null +++ b/clean-text-to-keywords/infer_json_usage.py @@ -0,0 +1,111 @@ +import argparse +import json +import os +import re +from typing import Sequence + +from keyword_extractor import KeywordExtractor +from json_inference import fill_template_from_keywords + + +def _build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + description="Extract keywords and infer values into a JSON template.", + ) + parser.add_argument( + "text", + nargs="*", + help="Input description text.", + ) + parser.add_argument( + "--template", + default="", + help="Path to JSON template file with keys only. 
If omitted, full inferred JSON is returned.", + ) + parser.add_argument( + "--model", + default="en_core_web_sm", + help="spaCy model name (default: en_core_web_sm).", + ) + parser.add_argument( + "--keywords", + nargs="+", + default=None, + help="Provide keywords directly instead of raw text.", + ) + parser.add_argument( + "--json-only", + action="store_true", + help="Print only inferred JSON (skip keyword list output).", + ) + return parser + + +def _load_template(path: str): + if not path: + return {} + + if not os.path.exists(path): + raise FileNotFoundError(f"Template file not found: {path}") + + with open(path, "r", encoding="utf-8") as file_handle: + raw = file_handle.read().strip() + if not raw: + return {} + return json.loads(raw) + + +def _parse_keywords_fragment(raw: str): + if not raw.strip(): + return [] + + try: + parsed = json.loads(raw) + if isinstance(parsed, list): + return [str(item).strip().lower() for item in parsed if str(item).strip()] + except json.JSONDecodeError: + pass + + tokens = re.findall(r"[a-zA-Z0-9_-]+", raw.lower()) + return [token for token in tokens if token] + + +def _extract_keywords(args): + if args.keywords: + return [word.strip().lower() for word in args.keywords if word.strip()] + + if args.template and not os.path.exists(args.template) and args.template.lstrip().startswith("["): + raw = " ".join([args.template] + args.text) + return _parse_keywords_fragment(raw) + + if not args.text: + raise ValueError("Provide input text or use --keywords.") + + text = " ".join(args.text) + extractor = KeywordExtractor.from_default_model(model_name=args.model) + return extractor.extract(text) + + +def main(argv: Sequence[str] | None = None) -> None: + parser = _build_parser() + args = parser.parse_args(argv) + + keywords = _extract_keywords(args) + + template_path = args.template + if args.template and not os.path.exists(args.template) and args.template.lstrip().startswith("["): + template_path = "" + + template = 
_load_template(template_path) + inferred_json = fill_template_from_keywords(template, keywords) + + if args.json_only: + print(json.dumps(inferred_json, indent=2)) + return + + print(json.dumps(keywords)) + print(json.dumps(inferred_json, indent=2)) + + +if __name__ == "__main__": + main() diff --git a/clean-text-to-keywords/json_inference.py b/clean-text-to-keywords/json_inference.py new file mode 100644 index 0000000..da1a960 --- /dev/null +++ b/clean-text-to-keywords/json_inference.py @@ -0,0 +1,398 @@ +"""Infer Pokemon-like JSON values from extracted keywords.""" + +from __future__ import annotations + +from copy import deepcopy +from typing import Any, Dict, Iterable, List, Mapping, Sequence + +POKEMON_TYPES = { + "normal", + "fire", + "water", + "grass", + "electric", + "ice", + "fighting", + "poison", + "ground", + "flying", + "psychic", + "bug", + "rock", + "ghost", + "dragon", + "dark", + "steel", + "fairy", +} + +HABITAT_KEYWORDS = { + "forest", + "field", + "cave", + "mountain", + "river", + "ocean", + "sea", + "tunnel", + "nest", + "sky", + "desert", + "swamp", + "volcano", +} + +PERSONALITY_KEYWORDS = { + "calm", + "gentle", + "agile", + "playful", + "cheerful", + "energetic", + "curious", + "fierce", + "brave", + "loyal", + "timid", + "bold", +} + +MOVE_KEYWORDS = { + "attack", + "smash", + "strike", + "kick", + "punch", + "shock", + "thunder", + "bolt", + "blast", + "explosion", + "freeze", + "bite", + "claw", + "tail", + "fight", +} + +ABILITY_KEYWORDS = { + "recover", + "endurance", + "explore", + "hide", + "wander", + "bond", + "speed", + "power", + "energy", + "flexible", +} + +STAT_HINTS = { + "hp": {"endurance", "recover", "energy", "stamina", "healthy", "vital"}, + "attack": {"attack", "smash", "strike", "punch", "kick", "claw", "fight", "power"}, + "defense": {"armor", "shield", "tough", "hard", "resist", "solid"}, + "speed": {"speed", "swift", "agile", "quick", "fast", "dash"}, +} + +KEY_ALIASES = { + "name": {"name", "pokemon_name"}, + 
"type": {"type", "primary_type", "pokemon_type"}, + "secondary_type": {"secondary_type", "type2", "secondary"}, + "attacks": {"attacks", "moves", "skills", "offense"}, + "abilities": {"abilities", "traits", "passives", "special_abilities"}, + "habitat": {"habitat", "environment", "region"}, + "personality": {"personality", "temperament", "nature"}, + "description": {"description", "flavor_text", "summary", "lore"}, + "keywords": {"keywords", "tags"}, + "hp": {"hp", "health", "health_points"}, + "attack": {"attack", "atk"}, + "defense": {"defense", "def"}, + "speed": {"speed", "spd"}, +} + +GENERIC_NAME_BLACKLIST = { + "black", + "white", + "yellow", + "red", + "blue", + "green", + "purple", + "orange", + "pink", + "gray", + "grey", + "brown", + "fur", + "body", + "tail", + "claw", + "storm", + "cloud", + "enemy", + "super", + "scary", + "giant", + "speed", +} + +TYPE_WEAKNESS = { + "normal": "fighting", + "fire": "water", + "water": "electric", + "grass": "fire", + "electric": "ground", + "ice": "fire", + "fighting": "psychic", + "poison": "ground", + "ground": "water", + "flying": "electric", + "psychic": "dark", + "bug": "fire", + "rock": "water", + "ghost": "dark", + "dragon": "fairy", + "dark": "fighting", + "steel": "fire", + "fairy": "steel", +} + + +def _title_case(value: str) -> str: + return " ".join(part.capitalize() for part in value.split()) + + +def _is_empty_value(value: Any) -> bool: + if value is None: + return True + if isinstance(value, str): + return value.strip() == "" + if isinstance(value, (list, dict, tuple, set)): + return len(value) == 0 + return False + + +def _canonical_key(key: str) -> str: + lowered = key.lower().strip() + for canonical, aliases in KEY_ALIASES.items(): + if lowered in aliases: + return canonical + return lowered + + +def _pick_name(keywords: Sequence[str]) -> str: + for keyword in keywords: + if keyword in POKEMON_TYPES: + continue + if keyword in HABITAT_KEYWORDS: + continue + if keyword in MOVE_KEYWORDS: + continue + 
if keyword in ABILITY_KEYWORDS: + continue + if keyword in PERSONALITY_KEYWORDS: + continue + if keyword in GENERIC_NAME_BLACKLIST: + continue + if len(keyword) < 4: + continue + return _title_case(keyword) + return "Unknown" + + +def _pick_types(keywords: Sequence[str]) -> List[str]: + types: List[str] = [] + for keyword in keywords: + if keyword in POKEMON_TYPES and keyword not in types: + types.append(keyword) + if len(types) >= 2: + break + if not types: + types.append("normal") + return types + + +def _pick_habitat(keywords: Sequence[str]) -> str: + habitats = [word for word in keywords if word in HABITAT_KEYWORDS] + if not habitats: + return "unknown" + return habitats[0] + + +def _pick_personality(keywords: Sequence[str]) -> List[str]: + result: List[str] = [] + for keyword in keywords: + if keyword in PERSONALITY_KEYWORDS and keyword not in result: + result.append(keyword) + return result[:3] + + +def _pick_attacks(keywords: Sequence[str]) -> List[str]: + attacks: List[str] = [] + for keyword in keywords: + if keyword in MOVE_KEYWORDS and keyword not in attacks: + attacks.append(keyword) + return attacks[:4] + + +def _pick_abilities(keywords: Sequence[str]) -> List[str]: + abilities: List[str] = [] + for keyword in keywords: + if keyword in ABILITY_KEYWORDS and keyword not in abilities: + abilities.append(keyword) + return abilities[:4] + + +def _score_stat(base: int, keywords: Sequence[str], hints: Iterable[str]) -> int: + hint_set = set(hints) + matches = sum(1 for keyword in keywords if keyword in hint_set) + # Each match adds 10 points; keep stats in [40, 160]. 
def _build_description(name: str, primary_type: str, attacks: Sequence[str], abilities: Sequence[str], habitat: str) -> str:
    """Compose the two-sentence flavor text for an inferred creature."""
    moves = "basic combat" if not attacks else ", ".join(attacks)
    traits = "balanced adaptation" if not abilities else ", ".join(abilities)
    return (
        f"{name} is a {primary_type}-type Pokemon often found in {habitat}. "
        f"It commonly uses {moves} and shows abilities like {traits}."
    )


def _retreat_cost_from_speed(speed: int) -> int:
    """Map a speed stat onto a 0-3 retreat cost (faster retreats cheaper)."""
    for threshold, cost in ((120, 0), (90, 1), (70, 2)):
        if speed >= threshold:
            return cost
    return 3


def _attack_damage_from_attack_stat(attack_stat: int, index: int) -> int:
    """Derive card attack damage from the attack stat in 10-step increments.

    Later attacks (higher *index*) hit harder; the result is clamped to
    the 10-160 range.
    """
    scaled = 30 + max(0, attack_stat - 70) // 2 + index * 10
    rounded = scaled - scaled % 10  # snap down to a multiple of 10
    return min(160, max(10, rounded))


def _energy_name_for_type(pokemon_type: str) -> str:
    """Return the TCG energy name for a type ('normal' maps to 'Colorless')."""
    return "Colorless" if pokemon_type == "normal" else _title_case(pokemon_type)


def _blank_field(mapping: Mapping[str, Any], key: str) -> bool:
    """True when *key* exists in *mapping* but holds an empty placeholder."""
    return key in mapping and _is_empty_value(mapping.get(key))


def _all_blank(values: Sequence[Any]) -> bool:
    """True when a list has no entries or only empty placeholders."""
    return not values or all(_is_empty_value(item) for item in values)


def _fill_tcg_like_template(output: Dict[str, Any], inferred: Mapping[str, Any]) -> None:
    """Fill the empty slots of a TCG-style card dict in place.

    Only placeholder values are replaced; anything the template already
    sets is left untouched.
    """
    if _blank_field(output, "name"):
        output["name"] = inferred["name"]

    if _blank_field(output, "description"):
        output["description"] = inferred["description"]

    if _blank_field(output, "hp"):
        hp = inferred["hp"]
        # Keep the slot's declared type: a string placeholder stays a string.
        output["hp"] = str(hp) if isinstance(output.get("hp"), str) else hp

    declared_types = output.get("types")
    if "types" in output and isinstance(declared_types, list) and _all_blank(declared_types):
        filled_types = [inferred["type"]]
        if inferred.get("secondary_type"):
            filled_types.append(inferred["secondary_type"])
        output["types"] = filled_types

    if _blank_field(output, "stage"):
        output["stage"] = "Basic"

    # 0 is the template's placeholder for retreat, so treat it as empty too.
    if "retreat" in output and output.get("retreat") in (None, 0, ""):
        output["retreat"] = _retreat_cost_from_speed(int(inferred["speed"]))

    weaknesses = output.get("weaknesses")
    if "weaknesses" in output and isinstance(weaknesses, list) and weaknesses:
        leading = weaknesses[0]
        if isinstance(leading, dict):
            if _is_empty_value(leading.get("type")):
                leading["type"] = TYPE_WEAKNESS.get(inferred["type"], "fighting")
            if _is_empty_value(leading.get("value")):
                leading["value"] = "x2"

    attack_slots = output.get("attacks")
    if "attacks" in output and isinstance(attack_slots, list):
        inferred_attacks = inferred["attacks"]
        for position, slot in enumerate(attack_slots):
            if not isinstance(slot, dict):
                continue

            # Fall back to a generic move once inferred attacks run out.
            move = inferred_attacks[position] if position < len(inferred_attacks) else "tackle"
            if _is_empty_value(slot.get("name")):
                slot["name"] = _title_case(move)
            if _is_empty_value(slot.get("effect")):
                slot["effect"] = f"Deals damage with {move}."
            if "damage" in slot and slot.get("damage") in (None, 0, ""):
                slot["damage"] = _attack_damage_from_attack_stat(int(inferred["attack"]), position)
            if "cost" in slot and isinstance(slot.get("cost"), list) and _all_blank(slot["cost"]):
                slot["cost"] = [_energy_name_for_type(inferred["type"])]


def infer_profile_from_keywords(keywords: Sequence[str]) -> Dict[str, Any]:
    """Build a full creature profile dict from a flat keyword list.

    Relies on this module's ``_pick_*`` heuristics and ``_score_stat``
    (defined earlier in the file) for each individual trait.
    """
    cleaned = [kw.strip().lower() for kw in keywords if kw and kw.strip()]

    name = _pick_name(cleaned)
    types = _pick_types(cleaned)
    attacks = _pick_attacks(cleaned)
    abilities = _pick_abilities(cleaned)
    habitat = _pick_habitat(cleaned)
    stats = {stat: _score_stat(70, cleaned, STAT_HINTS[stat]) for stat in ("hp", "attack", "defense", "speed")}

    return {
        "name": name,
        "type": types[0],
        "secondary_type": types[1] if len(types) > 1 else None,
        "attacks": attacks,
        "abilities": abilities,
        "habitat": habitat,
        "personality": _pick_personality(cleaned),
        **stats,
        "keywords": cleaned,
        "description": _build_description(name, types[0], attacks, abilities, habitat),
    }


def fill_template_from_keywords(template: Mapping[str, Any], keywords: Sequence[str]) -> Dict[str, Any]:
    """Fill a key-only template by inferring values from keywords.

    Existing non-empty values in template are preserved.
    """
    inferred = infer_profile_from_keywords(keywords)
    filled: Dict[str, Any] = deepcopy(dict(template))

    # An empty template means the caller wants the raw profile.
    if not filled:
        return inferred

    _fill_tcg_like_template(filled, inferred)

    # Top-up pass: any remaining empty slot whose key maps onto a profile
    # field gets the inferred value directly.
    for key in filled:
        canonical = _canonical_key(key)
        if canonical in inferred and _is_empty_value(filled[key]):
            filled[key] = inferred[canonical]

    return filled
"wing", "wings", "flight", "soar"], + "fighting": ["punch", "kick", "strike", "martial", "combat", "brawl"], + "poison": ["toxic", "venom", "acid", "poisonous", "toxin"], + "electric": ["lightning", "thunder", "shock", "volt", "spark", "electricity"], + "ground": ["earth", "soil", "sand", "mud", "quake", "dust"], + "rock": ["stone", "boulder", "crystal", "rocky", "pebble"], + "psychic": ["mind", "mental", "telepathy", "psyonic", "brain", "illusion"], + "ice": ["freeze", "frozen", "snow", "frost", "blizzard", "icy"], + "bug": ["insect", "ant", "beetle", "spider", "crawler"], + "ghost": ["spirit", "phantom", "haunt", "shadow", "specter"], + "steel": ["metal", "iron", "armor", "blade", "alloy"], + "dragon": ["drake", "wyrm", "serpent", "legendary"], + "dark": ["shadow", "evil", "night", "doom", "darkness"], + "fairy": ["magic", "magical", "sparkle", "light", "charm"], + "explosion": ["explosive", "explode", "blast"], +} + +DEFAULT_ALLOWED_POS: Tuple[str, ...] = ("NOUN", "ADJ", "VERB") +DEFAULT_IGNORED_KEYWORDS: Set[str] = {"preevolution", "pokmon"} +DEFAULT_POS_WEIGHTS: Dict[str, float] = { + "NOUN": 3.0, + "ADJ": 2.0, + "VERB": 1.0, +} +DEFAULT_KEEP_RATIO = 0.8 +DEFAULT_MIN_KEYWORDS = 12 +DEFAULT_MAX_KEYWORDS = 30 + + +def _invert_normalization_map(normalization_map: Mapping[str, Iterable[str]]) -> Dict[str, str]: + """Build synonym -> canonical mapping for O(1) normalization lookup.""" + inverse: Dict[str, str] = {} + for canonical, synonyms in normalization_map.items(): + canonical_normalized = canonical.strip().lower() + inverse[canonical_normalized] = canonical_normalized + for synonym in synonyms: + synonym_normalized = synonym.strip().lower() + if synonym_normalized: + inverse[synonym_normalized] = canonical_normalized + return inverse + + +def _tokenize_keyword_phrase(value: str) -> List[str]: + return re.findall(r"[a-z0-9]+", value.lower()) + + +@dataclass +class KeywordExtractor: + """Deterministic spaCy + YAKE + rule-based normalization pipeline.""" + + 
nlp: Any + normalization_map: Mapping[str, Iterable[str]] = field(default_factory=lambda: DEFAULT_NORMALIZATION_MAP) + allowed_pos: Sequence[str] = field(default_factory=lambda: DEFAULT_ALLOWED_POS) + ignored_keywords: Set[str] = field(default_factory=lambda: set(DEFAULT_IGNORED_KEYWORDS)) + pos_weights: Mapping[str, float] = field(default_factory=lambda: DEFAULT_POS_WEIGHTS) + keep_ratio: float = DEFAULT_KEEP_RATIO + min_keywords: int = DEFAULT_MIN_KEYWORDS + max_keywords: int = DEFAULT_MAX_KEYWORDS + use_yake: bool = True + + def __post_init__(self) -> None: + self._normalization_lookup = _invert_normalization_map(self.normalization_map) + self._allowed_pos_set = set(self.allowed_pos) + self._ignored_keywords = {keyword.lower().strip() for keyword in self.ignored_keywords} + self._pos_weight_lookup = {k.upper(): float(v) for k, v in self.pos_weights.items()} + + @classmethod + def from_default_model( + cls, + model_name: str = "en_core_web_sm", + normalization_map: Optional[Mapping[str, Iterable[str]]] = None, + allowed_pos: Sequence[str] = DEFAULT_ALLOWED_POS, + ignored_keywords: Optional[Set[str]] = None, + pos_weights: Mapping[str, float] = DEFAULT_POS_WEIGHTS, + keep_ratio: float = DEFAULT_KEEP_RATIO, + min_keywords: int = DEFAULT_MIN_KEYWORDS, + max_keywords: int = DEFAULT_MAX_KEYWORDS, + use_yake: bool = True, + ) -> "KeywordExtractor": + """Initialize extractor with a spaCy English pipeline.""" + try: + import spacy + + nlp = spacy.load(model_name) + except OSError as exc: + raise OSError( + f"spaCy model '{model_name}' is not installed. " + "Run: python -m spacy download en_core_web_sm" + ) from exc + except Exception as exc: + raise RuntimeError( + "spaCy could not be loaded in this Python environment. " + "Try Python 3.13 or lower, then install spaCy and en_core_web_sm." 
+ ) from exc + + return cls( + nlp=nlp, + normalization_map=normalization_map or DEFAULT_NORMALIZATION_MAP, + allowed_pos=allowed_pos, + ignored_keywords=ignored_keywords or set(DEFAULT_IGNORED_KEYWORDS), + pos_weights=pos_weights, + keep_ratio=keep_ratio, + min_keywords=min_keywords, + max_keywords=max_keywords, + use_yake=use_yake, + ) + + def extract(self, text: str) -> List[str]: + """Extract, normalize and rank keywords from already-cleaned text.""" + if not text or not text.strip(): + return [] + + doc = self.nlp(text) + + # Step 1: POS filtering + lowercase lemma/token extraction. + raw_keywords: List[Tuple[str, str]] = [] + for token in doc: + if token.is_stop or token.is_punct or token.pos_ not in self._allowed_pos_set: + continue + + base = token.lemma_.lower().strip() if token.lemma_ and token.lemma_ != "-PRON-" else token.text.lower().strip() + if base and base not in self._ignored_keywords: + raw_keywords.append((base, token.pos_)) + + # Step 2: Deduplicate before domain normalization. + deduplicated: List[Tuple[str, str]] = [] + seen_raw: Set[str] = set() + for keyword, pos in raw_keywords: + if keyword in seen_raw: + continue + seen_raw.add(keyword) + deduplicated.append((keyword, pos)) + + # Step 3: Normalize and deduplicate canonical forms. + unique_entries: List[Tuple[str, str, str, int]] = [] + seen_normalized: Set[str] = set() + for index, (original_keyword, pos) in enumerate(deduplicated): + normalized_keyword = self._normalize_keyword(original_keyword) + if normalized_keyword in seen_normalized: + continue + seen_normalized.add(normalized_keyword) + unique_entries.append((original_keyword, normalized_keyword, pos, index)) + + if not unique_entries: + return [] + + if not self.use_yake: + return [normalized_keyword for _, normalized_keyword, _, _ in unique_entries] + + # Step 4: YAKE scoring + conservative selection to preserve detail. 
+ yake_scores = self._extract_yake_scores(text) + if not yake_scores: + return [normalized_keyword for _, normalized_keyword, _, _ in unique_entries] + + ranked: List[Tuple[float, int, str]] = [] + for original_keyword, normalized_keyword, pos, index in unique_entries: + score_candidates: List[float] = [] + if original_keyword in yake_scores: + score_candidates.append(yake_scores[original_keyword]) + if normalized_keyword in yake_scores: + score_candidates.append(yake_scores[normalized_keyword]) + + # Missing score is treated as moderately relevant to avoid over-pruning. + yake_penalty = min(score_candidates) if score_candidates else 0.45 + pos_weight = self._pos_weight_lookup.get(pos.upper(), 1.0) + combined_score = (1.0 - yake_penalty) * pos_weight + ranked.append((combined_score, index, normalized_keyword)) + + target_count = self._compute_target_count(len(ranked)) + ranked.sort(key=lambda item: (-item[0], item[1])) + selected = ranked[:target_count] + selected.sort(key=lambda item: item[1]) + + return [keyword for _, _, keyword in selected] + + def _compute_target_count(self, total_keywords: int) -> int: + if total_keywords <= 0: + return 0 + + target = max(self.min_keywords, math.ceil(total_keywords * self.keep_ratio)) + if self.max_keywords > 0: + target = min(target, self.max_keywords) + return min(target, total_keywords) + + def _extract_yake_scores(self, text: str) -> Dict[str, float]: + try: + import yake + except Exception: + return {} + + text_token_count = len(text.split()) + top_n = max(20, min(80, text_token_count * 2)) + + try: + extractor = yake.KeywordExtractor(lan="en", n=2, dedupLim=0.9, top=top_n) + phrase_scores = extractor.extract_keywords(text) + except Exception: + return {} + + token_scores: Dict[str, float] = {} + for phrase, score in phrase_scores: + for token in _tokenize_keyword_phrase(phrase): + existing = token_scores.get(token) + if existing is None or score < existing: + token_scores[token] = score + + if not token_scores: + return 
{} + + values = list(token_scores.values()) + min_score = min(values) + max_score = max(values) + + if math.isclose(min_score, max_score): + return {token: 0.5 for token in token_scores} + + # Normalize so 0.0=most important and 1.0=least important. + return { + token: (score - min_score) / (max_score - min_score) + for token, score in token_scores.items() + } + + def _normalize_keyword(self, keyword: str) -> str: + keyword_lower = keyword.lower() + return self._normalization_lookup.get(keyword_lower, keyword_lower) + + +def extract_keywords( + text: str, + extractor: Optional[KeywordExtractor] = None, +) -> List[str]: + """Convenience API to extract keywords with default extractor config.""" + active_extractor = extractor or KeywordExtractor.from_default_model() + return active_extractor.extract(text) diff --git a/clean-text-to-keywords/requirements.txt b/clean-text-to-keywords/requirements.txt new file mode 100644 index 0000000..2349e69 --- /dev/null +++ b/clean-text-to-keywords/requirements.txt @@ -0,0 +1,2 @@ +spacy>=3.7.0 +yake>=0.4.2 diff --git a/clean-text-to-keywords/test_json_inference.py b/clean-text-to-keywords/test_json_inference.py new file mode 100644 index 0000000..101cebf --- /dev/null +++ b/clean-text-to-keywords/test_json_inference.py @@ -0,0 +1,143 @@ +import unittest + +from json_inference import fill_template_from_keywords, infer_profile_from_keywords + + +class JsonInferenceTests(unittest.TestCase): + def test_profile_inference_basics(self) -> None: + keywords = [ + "zapthorn", + "electric", + "wolf", + "thunder", + "claw", + "speed", + "storm", + "agile", + "forest", + "recover", + "energy", + ] + + profile = infer_profile_from_keywords(keywords) + + self.assertEqual(profile["name"], "Zapthorn") + self.assertEqual(profile["type"], "electric") + self.assertIn("thunder", profile["attacks"]) + self.assertIn("claw", profile["attacks"]) + self.assertIn("recover", profile["abilities"]) + self.assertEqual(profile["habitat"], "forest") + 
self.assertGreaterEqual(profile["speed"], 80) + + def test_fill_key_only_template(self) -> None: + template = { + "name": "", + "type": "", + "secondary_type": None, + "attacks": [], + "abilities": [], + "habitat": "", + "personality": [], + "hp": None, + "attack": None, + "defense": None, + "speed": None, + "description": "", + "keywords": [], + } + + keywords = [ + "furret", + "normal", + "tail", + "smash", + "tunnel", + "agile", + "cheerful", + "explore", + "endurance", + ] + + result = fill_template_from_keywords(template, keywords) + + self.assertEqual(result["name"], "Furret") + self.assertEqual(result["type"], "normal") + self.assertIn("smash", result["attacks"]) + self.assertIn("explore", result["abilities"]) + self.assertEqual(result["habitat"], "tunnel") + self.assertIn("cheerful", result["personality"]) + self.assertIsInstance(result["description"], str) + self.assertGreater(len(result["description"]), 20) + + def test_fill_tcg_style_template(self) -> None: + template = { + "category": "Pokemon", + "name": "", + "hp": "", + "types": [""], + "description": "", + "stage": "", + "attacks": [ + {"cost": [""], "name": "", "effect": ""}, + {"cost": [""], "name": "", "effect": "", "damage": 0}, + ], + "weaknesses": [{"type": "", "value": ""}], + "retreat": 0, + } + + keywords = [ + "zapthorn", + "electric", + "thunder", + "claw", + "speed", + "storm", + "energy", + ] + + result = fill_template_from_keywords(template, keywords) + + self.assertEqual(result["name"], "Zapthorn") + self.assertEqual(result["types"], ["electric"]) + self.assertEqual(result["stage"], "Basic") + self.assertTrue(result["hp"].isdigit()) + self.assertEqual(result["weaknesses"][0]["type"], "ground") + self.assertEqual(result["weaknesses"][0]["value"], "x2") + self.assertEqual(result["attacks"][0]["name"], "Thunder") + self.assertEqual(result["attacks"][1]["name"], "Claw") + self.assertEqual(result["attacks"][0]["cost"], ["Electric"]) + self.assertGreaterEqual(result["retreat"], 0) + + def 
test_name_fallback_to_unknown_for_generic_tokens(self) -> None: + keywords = [ + "black", + "fur", + "giant", + "electric", + "claw", + "speed", + "storm", + ] + + profile = infer_profile_from_keywords(keywords) + self.assertEqual(profile["name"], "Unknown") + + def test_preserves_existing_values(self) -> None: + template = { + "name": "CustomName", + "type": "electric", + "attacks": [], + "description": "Already set", + } + keywords = ["furret", "normal", "attack"] + + result = fill_template_from_keywords(template, keywords) + + self.assertEqual(result["name"], "CustomName") + self.assertEqual(result["type"], "electric") + self.assertEqual(result["description"], "Already set") + self.assertIn("attack", result["attacks"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/clean-text-to-keywords/test_keyword_extractor.py b/clean-text-to-keywords/test_keyword_extractor.py new file mode 100644 index 0000000..5e46dd8 --- /dev/null +++ b/clean-text-to-keywords/test_keyword_extractor.py @@ -0,0 +1,166 @@ +import unittest + +from keyword_extractor import KeywordExtractor + + +class FakeToken: + def __init__(self, text: str, pos: str, lemma: str, is_stop: bool) -> None: + self.text = text + self.pos_ = pos + self.lemma_ = lemma + self.is_stop = is_stop + self.is_punct = not any(ch.isalnum() for ch in text) + + +class FakeNLP: + def __init__(self, tag_map, stopwords) -> None: + self.tag_map = tag_map + self.stopwords = stopwords + + def __call__(self, text: str): + tokens = [] + for raw in text.split(): + token_text = raw.strip() + lowered = token_text.lower() + tokens.append( + FakeToken( + text=token_text, + pos=self.tag_map.get(lowered, "NOUN"), + lemma=lowered, + is_stop=lowered in self.stopwords, + ) + ) + return tokens + + +class TestableKeywordExtractor(KeywordExtractor): + def __init__(self, *args, yake_scores=None, **kwargs): + super().__init__(*args, **kwargs) + self._test_yake_scores = yake_scores or {} + + def _extract_yake_scores(self, text: str): + 
return self._test_yake_scores + + +class KeywordExtractorTests(unittest.TestCase): + @classmethod + def setUpClass(cls) -> None: + tag_map = { + "fiery": "ADJ", + "dragon": "NOUN", + "attack": "VERB", + "explosive": "ADJ", + "flames": "NOUN", + "burning": "ADJ", + "creature": "NOUN", + "with": "ADP", + "blaze": "NOUN", + "and": "CCONJ", + "dangerous": "ADJ", + "electric": "ADJ", + "mouse": "NOUN", + "using": "VERB", + "thunder": "NOUN", + "shock": "NOUN", + "strong": "ADJ", + "furret": "NOUN", + "long": "ADJ", + "slender": "ADJ", + "soft": "ADJ", + "fur": "NOUN", + "flexible": "ADJ", + "body": "NOUN", + "move": "VERB", + "gracefully": "ADJ", + "narrow": "ADJ", + "tunnel": "NOUN", + "tail": "NOUN", + "smash": "VERB", + "opponent": "NOUN", + "battle": "NOUN", + "cheerful": "ADJ", + "endurance": "NOUN", + } + + stopwords = { + "a", + "very", + "and", + "with", + "the", + "it", + "to", + "its", + "that", + "through", + "in", + } + cls.nlp = FakeNLP(tag_map=tag_map, stopwords=stopwords) + cls.extractor = KeywordExtractor(nlp=cls.nlp, use_yake=False) + + def test_readme_main_example(self) -> None: + text = "fiery dragon attack explosive flames" + result = self.extractor.extract(text) + self.assertEqual(result, ["fire", "dragon", "attack", "explosion"]) + + def test_synonym_normalization(self) -> None: + text = "burning creature with blaze power" + result = self.extractor.extract(text) + self.assertEqual(result, ["fire", "creature", "power"]) + + def test_mixed_types(self) -> None: + text = "electric mouse using thunder shock" + result = self.extractor.extract(text) + self.assertEqual(result, ["electric", "mouse", "using"]) + + def test_noise_input(self) -> None: + text = "a very very strong and dangerous creature" + result = self.extractor.extract(text) + self.assertEqual(result, ["strong", "dangerous", "creature"]) + + def test_yake_keeps_detailed_information(self) -> None: + text = ( + "furret long slender creature soft fur flexible body move gracefully narrow tunnel " 
+ "tail smash opponent battle cheerful endurance" + ) + + yake_scores = { + "furret": 0.00, + "creature": 0.05, + "tail": 0.08, + "battle": 0.10, + "smash": 0.12, + "tunnel": 0.14, + "endurance": 0.18, + "body": 0.20, + "cheerful": 0.22, + "slender": 0.26, + "flexible": 0.28, + "gracefully": 0.34, + "narrow": 0.40, + "long": 0.42, + "soft": 0.44, + "fur": 0.45, + "move": 0.48, + "opponent": 0.52, + } + extractor = TestableKeywordExtractor( + nlp=self.nlp, + use_yake=True, + keep_ratio=0.8, + min_keywords=10, + max_keywords=30, + yake_scores=yake_scores, + ) + + result = extractor.extract(text) + + self.assertGreaterEqual(len(result), 10) + self.assertIn("furret", result) + self.assertIn("creature", result) + self.assertIn("tail", result) + self.assertIn("tunnel", result) + + +if __name__ == "__main__": + unittest.main() diff --git a/fetch_card.py b/fetch_card.py new file mode 100644 index 0000000..71b94c7 --- /dev/null +++ b/fetch_card.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 +""" +Download Pokémon TCG card images with embedded JSON metadata. + +Uses the TCGdex SDK to: +1. List all sets (with configurable limit) +2. For each set, list all cards (with configurable limit) +3. 
def card_to_dict(card) -> dict:
    """Convert a card object to a JSON-serialisable dict, skipping SDK internals."""
    excluded = {"sdk", "get_image", "get_image_url"}
    result = {}
    for attr_name in dir(card):
        # Skip private attributes, known SDK plumbing, and bound methods.
        if attr_name.startswith("_") or attr_name in excluded:
            continue
        value = getattr(card, attr_name, None)
        if not callable(value):
            result[attr_name] = _serialise(value)
    return result


def _serialise(obj):
    """Recursively convert dataclass / nested objects to plain dicts.

    Primitives pass through; dataclasses, lists, dicts and plain objects
    are walked recursively (dropping any ``sdk`` field); anything else
    falls back to ``str``.
    """
    if obj is None or isinstance(obj, (str, int, float, bool)):
        return obj
    if is_dataclass(obj) and not isinstance(obj, type):
        return {key: _serialise(val) for key, val in asdict(obj).items() if key != "sdk"}
    if isinstance(obj, list):
        return [_serialise(item) for item in obj]
    if isinstance(obj, dict):
        return {key: _serialise(val) for key, val in obj.items()}
    # Fallback: try dataclass-style attribute extraction.
    if hasattr(obj, "__dict__"):
        return {key: _serialise(val) for key, val in obj.__dict__.items() if key != "sdk"}
    return str(obj)
def process_card(card_id: str, set_dir: Path) -> str | None:
    """Fetch card data + image and save. Returns card description on success."""
    # Each worker thread gets its own SDK handle.
    sdk = TCGdex(Language.EN)
    card = sdk.card.getSync(card_id)
    if not card:
        return None

    image_bytes = card.get_image(IMAGE_QUALITY, Extension.PNG).read()
    save_image_with_metadata(image_bytes, card_to_dict(card), set_dir / f"{card.localId}.png")

    return f"{card.name} ({card.id})"


def main():
    """Walk every set, download its cards in parallel, and report progress."""
    sdk = TCGdex(Language.EN)

    # 1. Get sets
    all_sets = sdk.set.listSync()
    if not all_sets:
        print("No sets returned.")
        return

    selected_sets = all_sets[:MAX_SETS] if MAX_SETS else all_sets
    print(f"Processing {len(selected_sets)} / {len(all_sets)} sets\n")

    downloaded = 0

    for index, set_summary in enumerate(selected_sets, 1):
        full_set = sdk.set.getSync(set_summary.id)
        if not full_set or not full_set.cards:
            print(f"[{index}] {set_summary.name}: no cards, skipping")
            continue

        cards = full_set.cards[:MAX_CARDS_PER_SET] if MAX_CARDS_PER_SET else full_set.cards
        card_total = full_set.cardCount.total if full_set.cardCount else len(full_set.cards)
        print(f"[{index}/{len(selected_sets)}] {set_summary.name} — {len(cards)}/{card_total} cards")

        set_dir = OUTPUT_DIR / set_summary.id

        # 2. Fan the per-card downloads out over a thread pool.
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
            pending = {pool.submit(process_card, brief.id, set_dir): brief.id for brief in cards}
            for future in as_completed(pending):
                card_id = pending[future]
                try:
                    description = future.result()
                except Exception as error:
                    print(f"  {card_id}: failed ({error})")
                    continue
                if description:
                    downloaded += 1
                    print(f"  {description} ✓")
                else:
                    print(f"  {card_id}: skipped")

        print()

    print(f"Done — {downloaded} cards saved to {OUTPUT_DIR}")


if __name__ == "__main__":
    main()
zxd5Luz2O8itiU?av2u9e5Q52haFCEp%MF2Qa>WEt4B?)ah`~6n#mS zQt5tL8q(|eBw!RyVpCd2M+jMC0~2q02h2o^Am>a^U9E<^GtX6%KM{ayu`NNu&_wX4 z@|ZO;=&hSF)VQ#nkuhON4dWaRUc>yYv|VB zu|{)n2g>MT4W~dtTCbotrnGMRgkhKWM~RS-;-x~*dV-` zrGhXsX6Z7VxAyc{I5acF#lMD+Bu-XCPN5do@Scw8J+yd}EXQ}j7Jzn?Nc|WaK`iec zp3DSOi1yP;Rtp(&jhx&Nt_9%E5No}aGlJKhV`oEW0(hh-k6*IP1>#g-rte8%fR!R5 z4rdsZ8Z0`%4dxq10W(TH=VR$>!HiAG)q4ayh-QYeGNrL?AB5cJxA2Q*rb6BEZ=Zc2 zMsG^qL`1Mszub@tEALSY8Qhbp4UjR}jfUtt`Ep zWh)cW4-;~)E@OgxhdVnTb;-)qDPOeYepRMVd^s}K2ulPjYviME8KnN`r=0gPaZ;BE zMnlQjy$q}X=`%TSd@+(G36)i=*gc;kf#ey>UDuBIf?$m(Bz{0fMh-{(4{*B&{!Fk! z$Np`@n)u328p>Txf0b!f)!L6 zyM#=P%mVbHF$X$tV1VdR0jsFaVu5U(#TgRPyZMeCTaNQG=lYDFyo=Ia9x0TRV#70= zw14A4|F+YP=i6lt*+{J!?VH-wXf}OOYm(M>cF)E`q@OSEn(Q+sGkR-Pi3|FK95*m= zxsHfqU5$)tYh$z{3y_TuEX71IoYJ7pcm9*sw&bMH5-i`U)Cc3(&(XBZP|8~Qn9RWd zc-UfZs6YE0yIbX+#KJdv<(K#1$2SNBqM$kf2DK~_~?TJ zOg)sgq`n= EPOCHS:\n", + " print(f\"Already trained {start_epoch} epochs (requested {EPOCHS}). Increase EPOCHS to continue.\")\n", + "else:\n", + " print(f\"Training epochs {start_epoch + 1} to {EPOCHS}...\")\n", + "\n", + "def encode_prompt(text_encoder, input_ids, attention_mask):\n", + " with torch.no_grad():\n", + " emb = text_encoder(input_ids, attention_mask=attention_mask)[0]\n", + " return emb\n", + "\n", + "for epoch in range(start_epoch, EPOCHS):\n", + " start = time.time()\n", + "\n", + " # Train\n", + " unet.train()\n", + " running_loss = 0.0\n", + " smooth_loss = None # exponential moving average\n", + "\n", + " pbar = tqdm(train_loader, desc=f\"[{epoch+1}/{EPOCHS}] Train\", leave=True,\n", + " bar_format=\"{l_bar}{bar:30}{r_bar}\", dynamic_ncols=True)\n", + "\n", + " for step, batch in enumerate(pbar):\n", + " pixel_values = batch[\"pixel_values\"].to(device)\n", + " input_ids = batch[\"input_ids\"].to(device)\n", + " attention_mask = batch[\"attention_mask\"].to(device)\n", + "\n", + " with torch.no_grad():\n", + " latents = vae.encode(pixel_values).latent_dist.sample() * 0.18215\n", + "\n", + " 
encoder_hidden_states = encode_prompt(text_encoder, input_ids, attention_mask)\n", + " noise = torch.randn_like(latents)\n", + " bsz = latents.shape[0]\n", + " timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=device).long()\n", + " noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)\n", + "\n", + " pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample\n", + " loss = torch.nn.functional.mse_loss(pred.float(), noise.float(), reduction=\"mean\")\n", + "\n", + " optimizer.zero_grad()\n", + " loss.backward()\n", + " torch.nn.utils.clip_grad_norm_(unet.parameters(), 1.0)\n", + " optimizer.step()\n", + "\n", + " running_loss += loss.item() * bsz\n", + "\n", + " # Exponential moving average for smoother loss display\n", + " smooth_loss = loss.item() if smooth_loss is None else 0.98 * smooth_loss + 0.02 * loss.item()\n", + "\n", + " pbar.set_postfix(ordered_dict={\n", + " \"loss\": f\"{loss.item():.4f}\",\n", + " \"ema\": f\"{smooth_loss:.4f}\",\n", + " \"avg\": f\"{running_loss / ((step + 1) * bsz):.4f}\",\n", + " })\n", + "\n", + " train_loss = running_loss / len(train_dataset)\n", + " train_losses.append(train_loss)\n", + "\n", + " # Validate\n", + " unet.eval()\n", + " running_loss = 0.0\n", + "\n", + " with torch.no_grad():\n", + " vbar = tqdm(val_loader, desc=f\"[{epoch+1}/{EPOCHS}] Val \", leave=True,\n", + " bar_format=\"{l_bar}{bar:30}{r_bar}\", dynamic_ncols=True)\n", + "\n", + " for step, batch in enumerate(vbar):\n", + " pixel_values = batch[\"pixel_values\"].to(device)\n", + " input_ids = batch[\"input_ids\"].to(device)\n", + " attention_mask = batch[\"attention_mask\"].to(device)\n", + "\n", + " latents = vae.encode(pixel_values).latent_dist.sample() * 0.18215\n", + " encoder_hidden_states = encode_prompt(text_encoder, input_ids, attention_mask)\n", + " noise = torch.randn_like(latents)\n", + " bsz = latents.shape[0]\n", + " timesteps = torch.randint(0, 
noise_scheduler.config.num_train_timesteps, (bsz,), device=device).long()\n", + " noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)\n", + "\n", + " pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample\n", + " loss = torch.nn.functional.mse_loss(pred.float(), noise.float(), reduction=\"mean\")\n", + " running_loss += loss.item() * bsz\n", + "\n", + " vbar.set_postfix(ordered_dict={\n", + " \"loss\": f\"{loss.item():.4f}\",\n", + " \"avg\": f\"{running_loss / ((step + 1) * bsz):.4f}\",\n", + " })\n", + "\n", + " val_loss = running_loss / len(val_dataset)\n", + " val_losses.append(val_loss)\n", + "\n", + " elapsed = time.time() - start\n", + " mins, secs = divmod(int(elapsed), 60)\n", + " print(f\"\\n{'='*60}\")\n", + " print(f\" Epoch {epoch+1}/{EPOCHS} complete — {mins}m{secs:02d}s\")\n", + " print(f\" Train loss : {train_loss:.4f}\")\n", + " print(f\" Val loss : {val_loss:.4f}\")\n", + " print(f\"{'='*60}\\n\")\n", + "\n", + " # Save checkpoint after each epoch\n", + " Path(CHECKPOINT_DIR).mkdir(exist_ok=True)\n", + " torch.save({\n", + " \"epoch\": epoch + 1,\n", + " \"unet_state_dict\": unet.state_dict(),\n", + " \"optimizer_state_dict\": optimizer.state_dict(),\n", + " \"train_losses\": train_losses,\n", + " \"val_losses\": val_losses,\n", + " }, CHECKPOINT_PATH)\n", + " print(f\" Checkpoint saved (epoch {epoch + 1})\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAxYAAAGGCAYAAADmRxfNAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQAAU5lJREFUeJzt3XtcVVXi///3AeSAHg+iIHhBEXFMTbygInbRMbxQmZqT5piiY+OlspSxScbyUtNHS2bS1CGnPpk2Hy9ZaVdFhZjUyAuKEZqVeWmUi6ZchATl7N8f/jzfzoAGHvCIvp6Px37EXmftvdY+W22/WXvtbTIMwxAAAAAAOMHN1R0AAAAAUPsRLAAAAAA4jWABAAAAwGkECwAAAABOI1gAAAAAcBrBAgAAAIDTCBYAAAAAnEawAAAAAOA0ggUAAAAApxEsAMDFxo4dq+Dg4Gvads6cOTKZTNXboVouJSVFJpNJKSkp9rLKfsdHjx6VyWTSW2+9Va19Cg4O1tixY6t1nwBwoyFYAMAVmEymSi2/vIC91dhsNsXHx6tNmzby9vZW69atNXnyZJ07d65S24eFhalFixYyDOOKde644w4FBATo4sWL1dXtGvHFF19ozpw5ysvLc3VX7N566y2ZTCbt2bPH1V0BcAvwcHUHAOBG9fbbbzusr1y5Ulu2bClX3q5dO6faef3112Wz2a5p22effVYzZsxwqn1nLFq0SE8//bSGDBmip59+WseOHdPq1av1zDPPyGKx/Or2o0aN0owZM7Rt2zbdfffd5T4/evSoUlNT9cQTT8jD49r/l+XMd1xZX3zxhebOnauxY8eqQYMGDp8dOnRIbm78Lg/AzY1gAQBX8Mgjjzisf/nll9qyZUu58v9WXFysunXrVrqdOnXqXFP/JMnDw8OpC25nrVmzRh06dND7779vvyXrhRdeqPRF/O9//3vFxcVp1apVFQaL1atXyzAMjRo1yql+OvMdVwez2ezS9gHgeuDXJwDghD59+uj2229XWlqa7r77btWtW1d/+ctfJEkffPCB7rvvPjVt2lRms1mtW7fWCy+8oLKyMod9/Pf9/5fv84+Pj9c///lPtW7dWmazWd27d9fu3bsdtq1ojoXJZNITTzyhDRs26Pbbb5fZbFaHDh20adOmcv1PSUlRt27d5OXlpdatW2vZsmVVmrfh5uYmm83mUN/Nza3SYScoKEh333233n33XV24cKHc56tWrVLr1q0VERGhY8eO6bHHHlPbtm3l7e2tRo0a6aGHHtLRo0d/tZ2K5ljk5eVp7Nix8vHxUYMGDRQTE1PhbUxfffWVxo4dq5CQEHl5eSkwMFB/+MMf9NNPP9nrzJkzR08//bQkqVWrVvbb5C73raI5Fj/88IMeeughNWzYUHXr1lXPnj31ySefONS5PF/knXfe0YsvvqjmzZvLy8tL99xzj77//vtfPe7K2rdvn6Kjo2W1WmWxWHTPPffoyy+/dKhz4cIFzZ07V23atJGXl5caNWqkO++8U1u2bLHXyc7O1rhx49S8eXOZzWY1adJEgwcPrtQ5AlD7MWIBAE766aefFB0drYcffliPPPKIAgICJF26v91isSg2NlYWi0XJycmaNWuWCgoKtGDBgl/d76pVq1RYWKiJEyfKZDLp5Zdf1oMPPqgffvjhV38Dv337dr3//vt67LHHVL9+fb366qsaNmyYjh8/rkaNGkm6dDE5cOBANWnSRHPnzlVZWZmef/55+fv7V/rYx40bp4kTJ2rZsmWaOHFipbf7pVGjRmnChAlKTEzU/fffby/PyMjQ119/rVmzZkmSdu/erS+++EIPP/ywmjdvrqNHjyohIUF9+vTRgQMHqjRKZBiGBg8erO3bt2vSpElq166d1q9fr5iYmHJ1t2zZoh9++EHjxo1TYGCgMjMz9c9//lOZmZn68ssvZTKZ9OCDD+rbb7/V6tWr9corr8jPz0+Srvhd5uTkqFevXiouLtaTTz6pRo0aacW
KFXrggQf07rvvaujQoQ7158+fLzc3N02fPl35+fl6+eWXNWrUKO3cubPSx3wlmZmZuuuuu2S1WvXnP/9ZderU0bJly9SnTx/9+9//VkREhKRL4WnevHl69NFH1aNHDxUUFGjPnj3au3ev+vXrJ0kaNmyYMjMzNWXKFAUHBys3N1dbtmzR8ePHr/kBBQBqEQMAUCmPP/648d//bPbu3duQZLz22mvl6hcXF5crmzhxolG3bl3j/Pnz9rKYmBijZcuW9vUjR44YkoxGjRoZZ86csZd/8MEHhiTjo48+spfNnj27XJ8kGZ6ensb3339vL9u/f78hyVi8eLG9bNCgQUbdunWNEydO2Mu+++47w8PDo9w+r2TGjBmGp6en4e7ubrz//vuV2ua/nTlzxjCbzcbIkSPL7VuScejQIcMwKv4+U1NTDUnGypUr7WWfffaZIcn47LPP7GX//R1v2LDBkGS8/PLL9rKLFy8ad911lyHJWL58ub28onZXr15tSDI+//xze9mCBQsMScaRI0fK1W/ZsqURExNjX586daohydi2bZu9rLCw0GjVqpURHBxslJWVORxLu3btjJKSEnvdRYsWGZKMjIyMcm390vLlyw1Jxu7du69YZ8iQIYanp6dx+PBhe9nJkyeN+vXrG3fffbe9rFOnTsZ99913xf2cPXvWkGQsWLDgqn0CcPPiVigAcJLZbNa4cePKlXt7e9t/Liws1OnTp3XXXXepuLhY33zzza/ud8SIEfL19bWv33XXXZIu3ULza6KiotS6dWv7elhYmKxWq33bsrIybd26VUOGDFHTpk3t9UJDQxUdHf2r+5ekV199VX//+9+1Y8cOjRw5Ug8//LA2b97sUMdsNuu555676n58fX1177336sMPP1RRUZGkSyMKa9asUbdu3fSb3/xGkuP3eeHCBf30008KDQ1VgwYNtHfv3kr1+bJPP/1UHh4emjx5sr3M3d1dU6ZMKVf3l+2eP39ep0+fVs+ePSWpyu3+sv0ePXrozjvvtJdZLBZNmDBBR48e1YEDBxzqjxs3Tp6envb1qvxZuJqysjJt3rxZQ4YMUUhIiL28SZMm+v3vf6/t27eroKBAktSgQQNlZmbqu+++q3Bf3t7e8vT0VEpKis6ePetUvwDUTgQLAHBSs2bNHC76LsvMzNTQoUPl4+Mjq9Uqf39/+8Tv/Pz8X91vixYtHNYvh4zKXLT997aXt7+8bW5urn7++WeFhoaWq1dR2X/7+eefNXv2bD366KPq1q2bli9frr59+2ro0KHavn27JOm7775TaWmp/Vaaqxk1apSKior0wQcfSLr0hKWjR486TNr++eefNWvWLAUFBclsNsvPz0/+/v7Ky8ur1Pf5S8eOHVOTJk3KPbmqbdu25eqeOXNGTz31lAICAuTt7S1/f3+1atVKUuXO45Xar6ity08YO3bsmEO5M38WrubUqVMqLi6+Yl9sNpt+/PFHSdLzzz+vvLw8/eY3v1HHjh319NNP66uvvrLXN5vNeumll7Rx40YFBATo7rvv1ssvv6zs7Gyn+gig9iBYAICTfvkb7cvy8vLUu3dv7d+/X88//7w++ugjbdmyRS+99JIkVeqpSe7u7hWWG1d550N1bFsZBw8eVF5env039x4eHnr33Xd1++2367777tPevXv1z3/+U40bN7bff381999/v3x8fLRq1SpJl+aXuLu76+GHH7bXmTJlil588UUNHz5c77zzjjZv3qwtW7aoUaNGNfoo2eHDh+v111/XpEmT9P7772vz5s32ifA1/Qjby2r6fFbG3XffrcOHD+vNN9/U7bffrjfeeENdu3bVG2+8Ya8zdepUffvtt5o3b568vLz03HPPqV27dtq3b9916ycA12HyNgDUgJSUFP300096//33HR6jeuTIERf26v9p3LixvLy8KnyyUGWeNnT5KVCXf5stSfXq1dOnn36qO++8UwMGDND58+f117/+tVKPWjWbzfrd736nlSt
XKicnR+vWrVPfvn0VGBhor/Puu+8qJiZGf/vb3+xl58+fv6YX0rVs2VJJSUk6d+6cw6jFoUOHHOqdPXtWSUlJmjt3rn0SuaQKbweqyhvQW7ZsWa4tSfZb5Fq2bFnpfTnD399fdevWvWJf3NzcFBQUZC9r2LChxo0bp3HjxuncuXO6++67NWfOHD366KP2Oq1bt9af/vQn/elPf9J3332nzp07629/+5v+9a9/XZdjAuA6jFgAQA24/BvmX/5GubS0VP/4xz9c1SUH7u7uioqK0oYNG3Ty5El7+ffff6+NGzf+6vYdO3ZUQECAlixZotzcXHt5o0aNtHz5cp0+fVo///yzBg0aVOk+jRo1ShcuXNDEiRN16tSpcu+ucHd3L/cb+sWLF5d7fG9l3Hvvvbp48aISEhLsZWVlZVq8eHG5NqXyIwMLFy4st8969epJUqWCzr333qtdu3YpNTXVXlZUVKR//vOfCg4OVvv27St7KE5xd3dX//799cEHHzg8EjYnJ0erVq3SnXfeKavVKkkOj9eVLs0JCQ0NVUlJiaRL7285f/68Q53WrVurfv369joAbm6MWABADejVq5d8fX0VExOjJ598UiaTSW+//fZ1vXXl18yZM0ebN2/WHXfcocmTJ6usrExLlizR7bffrvT09Ktu6+HhoSVLlmjEiBHq2LGjJk6cqJYtW+rgwYN688031bFjR/3nP//R4MGDtWPHDvvF6dX07t1bzZs31wcffCBvb289+OCDDp/ff//9evvtt+Xj46P27dsrNTVVW7dutT8+tyoGDRqkO+64QzNmzNDRo0fVvn17vf/+++XmTFitVvtcgQsXLqhZs2bavHlzhSNP4eHhkqSZM2fq4YcfVp06dTRo0CB74PilGTNmaPXq1YqOjtaTTz6phg0basWKFTpy5Ijee++9an9L95tvvlnhe0yeeuop/fWvf9WWLVt055136rHHHpOHh4eWLVumkpISvfzyy/a67du3V58+fRQeHq6GDRtqz549evfdd/XEE09Ikr799lvdc889Gj58uNq3by8PDw+tX79eOTk5Dre0Abh5ESwAoAY0atRIH3/8sf70pz/p2Wefla+vrx555BHdc889GjBggKu7J+nShfDGjRs1ffp0PffccwoKCtLzzz+vgwcPVuqpVb/73e+UkpKiF198UYsWLVJJSYnatGmjP//5z3rqqaf073//W/fdd58eeughffLJJ7/60jw3NzeNHDlSCxYs0KBBg1S/fn2HzxctWiR3d3f93//9n86fP6877rhDW7duvabv083NTR9++KGmTp2qf/3rXzKZTHrggQf0t7/9TV26dHGou2rVKk2ZMkVLly6VYRjq37+/Nm7c6PA0LUnq3r27XnjhBb322mvatGmTbDabjhw5UmGwCAgI0BdffKFnnnlGixcv1vnz5xUWFqaPPvpI9913X5WP59f8cmTml8aOHasOHTpo27ZtiouL07x582Sz2RQREaF//etfDhPvn3zySX344YfavHmzSkpK1LJlS/31r3+1vxgwKChII0eOVFJSkt5++215eHjotttu0zvvvKNhw4ZV+zEBuPGYjBvp12cAAJcbMmTIVR8rCgBARZhjAQC3sJ9//tlh/bvvvtOnn36qPn36uKZDAIBaixELALiFNWnSRGPHjlVISIiOHTumhIQElZSUaN++fWrTpo2ruwcAqEWYYwEAt7CBAwdq9erVys7OltlsVmRkpP7nf/6HUAEAqDJGLAAAAAA4jTkWAAAAAJxGsAAAAADgNOZY1CCbzaaTJ0+qfv36MplMru4OAAAAUCWGYaiwsFBNmzb91Zd3Eixq0MmTJxUUFOTqbgAAAABO+fHHH9W8efOr1nF5sFi6dKkWLFig7OxsderUSYsXL1aPHj0qrJuZmalZs2YpLS1Nx44d0yuvvKKpU6c61ElISFBCQoKOHj0qSerQoYNmzZql6Ohoe53Dhw9r+vTp2r59u0pKSjRw4EAtXrx
YAQEB9jrffvutnn76ae3YsUOlpaUKCwvTCy+8oN/+9reVPrbLb4398ccfZbVaK70dAAAAcCMoKChQUFCQ/br2alwaLNauXavY2Fi99tprioiI0MKFCzVgwAAdOnRIjRs3Lle/uLhYISEheuihhzRt2rQK99m8eXPNnz9fbdq0kWEYWrFihQYPHqx9+/apQ4cOKioqUv/+/dWpUyclJydLkp577jkNGjRIX375pX2I5/7771ebNm2UnJwsb29vLVy4UPfff78OHz6swMDASh3f5dufrFYrwQIAAAC1VmVu63fp42YjIiLUvXt3LVmyRNKlOQlBQUGaMmWKZsyYcdVtg4ODNXXq1HIjFhVp2LChFixYoPHjx2vz5s2Kjo7W2bNn7Rf7+fn58vX11ebNmxUVFaXTp0/L399fn3/+ue666y5JUmFhoaxWq7Zs2aKoqKhKHV9BQYF8fHyUn59PsAAAAECtU5XrWZc9Faq0tFRpaWkOF+lubm6KiopSampqtbRRVlamNWvWqKioSJGRkZKkkpISmUwmmc1mez0vLy+5ublp+/btkqRGjRqpbdu2WrlypYqKinTx4kUtW7ZMjRs3Vnh4+BXbKykpUUFBgcMCAAAA3ApcFixOnz6tsrIyh3kNkhQQEKDs7Gyn9p2RkSGLxSKz2axJkyZp/fr1at++vSSpZ8+eqlevnp555hkVFxerqKhI06dPV1lZmbKysiRdGurZunWr9u3bp/r168vLy0t///vftWnTJvn6+l6x3Xnz5snHx8e+MHEbAAAAtwqXT96uCW3btlV6erry8/P17rvvKiYmRv/+97/Vvn17+fv7a926dZo8ebJeffVVubm5aeTIkeratat9foVhGHr88cfVuHFjbdu2Td7e3nrjjTc0aNAg7d69W02aNKmw3bi4OMXGxtrXL092AQAAQM2w2WwqLS11dTdqrTp16sjd3b1a9uWyYOHn5yd3d3fl5OQ4lOfk5FR6cvSVeHp6KjQ0VJIUHh6u3bt3a9GiRVq2bJkkqX///jp8+LBOnz4tDw8PNWjQQIGBgQoJCZEkJScn6+OPP3aYh/GPf/xDW7Zs0YoVK644/8NsNjvcYgUAAICaU1paqiNHjshms7m6K7Xa5WthZ9+75rJg4enpqfDwcCUlJWnIkCGSLiXOpKQkPfHEE9Xals1mU0lJSblyPz8/SZeCRG5urh544AFJl54+JancS0Dc3Nz4gwsAAHADMAxDWVlZcnd3V1BQ0K++vA3lGYah4uJi5ebmStIV78qpLJfeChUbG6uYmBh169ZNPXr00MKFC1VUVKRx48ZJksaMGaNmzZpp3rx5ki6l0gMHDth/PnHihNLT02WxWOwjFHFxcYqOjlaLFi1UWFioVatWKSUlRYmJifZ2ly9frnbt2snf31+pqal66qmnNG3aNLVt21aSFBkZKV9fX8XExGjWrFny9vbW66+/riNHjui+++67nl8RAAAAKnDx4kUVFxeradOmqlu3rqu7U2t5e3tLknJzc9W4cWOnbotyabAYMWKETp06pVmzZik7O1udO3fWpk2b7BO6jx8/7pA+T548qS5dutjX4+PjFR8fr969eyslJUXSpS9lzJgxysrKko+Pj8LCwpSYmKh+/frZtzt06JDi4uJ05swZBQcHa+bMmQ7vxfDz89OmTZs0c+ZM9e3bVxcuXFCHDh30wQcfqFOnTjX8rQAAAODXlJWVSbp0FwycczmYXbhwwalg4dL3WNzseI8FAABAzTh//ryOHDmiVq1aycvLy9XdqdWu9l3WivdYAAAAALh5ECwAAACAWiw4OFgLFy50dTcIFgAAAMD1YDKZrrrMmTPnmva7e/duTZgwoXo7ew1uyhfkAQAAADearKws+89r167VrFmzdOjQIXuZxWKx/2wYhsrKyuTh8euX6/7+/tXb0WvEiAUAAABwHQQGBtoXHx8fmUwm+/o333yj+vXra+PGjQoPD5fZbNb27dt1+PBhDR48WAE
BAbJYLOrevbu2bt3qsN//vhXKZDLpjTfe0NChQ1W3bl21adNGH374YY0fH8ECAAAAtZ5hGCouveiSpTofsjpjxgzNnz9fBw8eVFhYmM6dO6d7771XSUlJ2rdvnwYOHKhBgwbp+PHjV93P3LlzNXz4cH311Ve69957NWrUKJ05c6ba+lkRboUCAABArffzhTK1n5X46xVrwIHnB6iuZ/VcVj///PMO719r2LChw3vUXnjhBa1fv14ffvihnnjiiSvuZ+zYsRo5cqQk6X/+53/06quvateuXRo4cGC19LMijFgAAAAAN4hu3bo5rJ87d07Tp09Xu3bt1KBBA1ksFh08ePBXRyzCwsLsP9erV09Wq1W5ubk10ufLGLEAAABAreddx10Hnh/gsrarS7169RzWp0+fri1btig+Pl6hoaHy9vbW7373O5WWll51P3Xq1HFYN5lMstls1dbPihAsAAAAUOuZTKZqux3pRrJjxw6NHTtWQ4cOlXRpBOPo0aOu7dQVcCsUAAAAcINq06aN3n//faWnp2v//v36/e9/X+MjD9eKYAEAAADcoP7+97/L19dXvXr10qBBgzRgwAB17drV1d2qkMmozudjwUFBQYF8fHyUn58vq9Xq6u4AAADcNM6fP68jR46oVatW8vLycnV3arWrfZdVuZ5lxAIAAACA0wgWAAAAAJxGsAAAAADgNIIFAAAAAKcRLAAAAAA4jWABAAAAwGkECwAAAABOI1gAAAAAcBrBAgAAAIDTCBYAAABALdGnTx9NnTrV1d2oEMECAAAAuA4GDRqkgQMHVvjZtm3bZDKZ9NVXX13nXlUfggUAAABwHYwfP15btmzRf/7zn3KfLV++XN26dVNYWJgLelY9CBYAAADAdXD//ffL399fb731lkP5uXPntG7dOg0ZMkQjR45Us2bNVLduXXXs2FGrV692TWevAcECAAAAtZ9hSKVFrlkMo1Jd9PDw0JgxY/TWW2/J+MU269atU1lZmR555BGFh4frk08+0ddff60JEyZo9OjR2rVrV019a9XKw9UdWLp0qRYsWKDs7Gx16tRJixcvVo8ePSqsm5mZqVmzZiktLU3Hjh3TK6+8Um7ySkJCghISEnT06FFJUocOHTRr1ixFR0fb6xw+fFjTp0/X9u3bVVJSooEDB2rx4sUKCAhw2Ncnn3yi559/Xl999ZW8vLzUu3dvbdiwoToPHwAAANXhQrH0P01d0/ZfTkqe9SpV9Q9/+IMWLFigf//73+rTp4+kS7dBDRs2TC1bttT06dPtdadMmaLExES98847V7w+vpG4dMRi7dq1io2N1ezZs7V371516tRJAwYMUG5uboX1i4uLFRISovnz5yswMLDCOs2bN9f8+fOVlpamPXv2qG/fvho8eLAyMzMlSUVFRerfv79MJpOSk5O1Y8cOlZaWatCgQbLZbPb9vPfeexo9erTGjRun/fv3a8eOHfr9739f/V8CAAAAbhm33XabevXqpTfffFOS9P3332vbtm0aP368ysrK9MILL6hjx45q2LChLBaLEhMTdfz4cRf3unJMhlHJsZsaEBERoe7du2vJkiWSJJvNpqCgIE2ZMkUzZsy46rbBwcGaOnVqpR631bBhQy1YsEDjx4/X5s2bFR0drbNnz8pqtUqS8vPz5evrq82bNysqKkoXL15UcHCw5s6dq/Hjx1/z8RUUFMjHx0f5+fn2tgAAAOC88+fP68iRI2rVqpW8vLwu3Y50odg1nalTVzKZKl39zTff1JQpU5Sdna358+dr7dq1+u677/TSSy8pPj5eCxcuVMeOHVWvXj1NnTpVHh4e9rtm+vTpo86dO2vhwoXV1v1y3+UvVOV61mUjFqWlpUpLS1NUVNT/64ybm6KiopSamlotbZSVlWnNmjUqKipSZGSkJKmkpEQmk0lms9lez8vLS25ubtq+fbskae/evTpx4oTc3NzUpUsXNWnSRNHR0fr666+rpV8AAACoZibTpduRXLF
UIVRI0vDhw+Xm5qZVq1Zp5cqV+sMf/iCTyaQdO3Zo8ODBeuSRR9SpUyeFhITo22+/raEvrPq5LFicPn1aZWVl5eY1BAQEKDs726l9Z2RkyGKxyGw2a9KkSVq/fr3at28vSerZs6fq1aunZ555RsXFxSoqKtL06dNVVlamrKwsSdIPP/wgSZozZ46effZZffzxx/L19VWfPn105syZK7ZbUlKigoIChwUAAAD4JYvFohEjRiguLk5ZWVkaO3asJKlNmzbasmWLvvjiCx08eFATJ05UTk6OaztbBTflU6Hatm2r9PR07dy5U5MnT1ZMTIwOHDggSfL399e6dev00UcfyWKxyMfHR3l5eeratavc3C59HZfnWsycOVPDhg1TeHi4li9fLpPJpHXr1l2x3Xnz5snHx8e+BAUF1fzBAgAAoNYZP368zp49qwEDBqhp00uTzp999ll17dpVAwYMUJ8+fRQYGKghQ4a4tqNV4LKnQvn5+cnd3b1cCsvJybnixOzK8vT0VGhoqCQpPDxcu3fv1qJFi7Rs2TJJUv/+/XX48GGdPn1aHh4eatCggQIDAxUSEiJJatKkiSTZRzkkyWw2KyQk5KqTZ+Li4hQbG2tfLygoIFwAAACgnMjISP33VOeGDRv+6hNIU1JSaq5TTnLZiIWnp6fCw8OVlJRkL7PZbEpKSrLPh6guNptNJSUl5cr9/PzUoEEDJScnKzc3Vw888ICkS2HEbDbr0KFD9roXLlzQ0aNH1bJlyyu2YzabZbVaHRYAAADgVuDS91jExsYqJiZG3bp1U48ePbRw4UIVFRVp3LhxkqQxY8aoWbNmmjdvnqRLE74v39JUWlqqEydOKD09XRaLxT5CERcXp+joaLVo0UKFhYVatWqVUlJSlJiYaG93+fLlateunfz9/ZWamqqnnnpK06ZNU9u2bSVJVqtVkyZN0uzZsxUUFKSWLVtqwYIFkqSHHnroun0/AAAAQG3h0mAxYsQInTp1SrNmzVJ2drY6d+6sTZs22Sd0Hz9+3D7vQZJOnjypLl262Nfj4+MVHx+v3r1724eFcnNzNWbMGGVlZcnHx0dhYWFKTExUv3797NsdOnRIcXFxOnPmjIKDgzVz5kxNmzbNoW8LFiyQh4eHRo8erZ9//lkRERFKTk6Wr69vDX4jAAAAQO3k0vdY3Ox4jwUAAEDNuNq7F1A1tf49FgAAAABuHgQLAAAA1FrcfOO8y69acJZL51gAAAAA16JOnToymUw6deqU/P39Zari269xKZSVlpbq1KlTcnNzk6enp1P7I1gAAACg1nF3d1fz5s31n//8R0ePHnV1d2q1unXrqkWLFg4PTboWBAsAAADUShaLRW3atNGFCxdc3ZVay93dXR4eHtUy4kOwAAAAQK3l7u4ud3d3V3cDYvI2AAAAgGpAsAAAAADgNIIFAAAAAKcRLAAAAAA4jWABAAAAwGkECwAAAABOI1gAAAAAcBrBAgAAAIDTCBYAAAAAnEawAAAAAOA0ggUAAAAApxEsAAAAADiNYAEAAADAaQQLAAAAAE4jWAAAAABwGsECAAAAgNMIFgAAAACcRrAAAAAA4DSCBQAAAACnESwAAAAAOI1gAQAAAMBpBAsAAAAATrshgsXSpUsVHBwsLy8vRUREaNeuXVesm5mZqWHDhik4OFgmk0kLFy4sVychIUFhYWGyWq2yWq2KjIzUxo0bHeocPnxYQ4cOlb+/v6xWq4YPH66cnJwK2ywpKVHnzp1lMpmUnp7uzKECAAAANyWXB4u1a9cqNjZWs2fP1t69e9WpUycNGDBAubm5FdYvLi5WSEiI5s+fr8DAwArrNG/eXPPnz1daWpr27Nmjvn37avDgwcrMzJQkFRUVqX///jKZTEpOTtaOHTtUWlqqQYMGyWazldvfn//8ZzVt2rT6DhoAAAC4yZgMwzBc2YGIiAh1795dS5YskSTZbDYFBQVpypQpmjFjxlW3DQ4O1tSpUzV16tRfbadhw4Z
asGCBxo8fr82bNys6Olpnz56V1WqVJOXn58vX11ebN29WVFSUfbuNGzcqNjZW7733njp06KB9+/apc+fOlTq2goIC+fj4KD8/394OAAAAUFtU5XrWpSMWpaWlSktLc7iQd3NzU1RUlFJTU6uljbKyMq1Zs0ZFRUWKjIyUdOnWJpPJJLPZbK/n5eUlNzc3bd++3V6Wk5OjP/7xj3r77bdVt27dX22rpKREBQUFDgsAAABwK3BpsDh9+rTKysoUEBDgUB4QEKDs7Gyn9p2RkSGLxSKz2axJkyZp/fr1at++vSSpZ8+eqlevnp555hkVFxerqKhI06dPV1lZmbKysiRJhmFo7NixmjRpkrp161apNufNmycfHx/7EhQU5NQxAAAAALWFy+dY1JS2bdsqPT1dO3fu1OTJkxUTE6MDBw5Ikvz9/bVu3Tp99NFHslgs8vHxUV5enrp27So3t0tfyeLFi1VYWKi4uLhKtxkXF6f8/Hz78uOPP9bIsQEAAAA3Gg9XNu7n5yd3d/dyT2PKycm54sTsyvL09FRoaKgkKTw8XLt379aiRYu0bNkySVL//v11+PBhnT59Wh4eHmrQoIECAwMVEhIiSUpOTlZqaqrD7VKS1K1bN40aNUorVqwo16bZbC5XHwAAALgVuHTEwtPTU+Hh4UpKSrKX2Ww2JSUl2edDVBebzaaSkpJy5X5+fmrQoIGSk5OVm5urBx54QJL06quvav/+/UpPT1d6ero+/fRTSZeeYvXiiy9Wa98AAACA2s6lIxaSFBsbq5iYGHXr1k09evTQwoULVVRUpHHjxkmSxowZo2bNmmnevHmSLk34vnxLU2lpqU6cOKH09HRZLBb7CEVcXJyio6PVokULFRYWatWqVUpJSVFiYqK93eXLl6tdu3by9/dXamqqnnrqKU2bNk1t27aVJLVo0cKhnxaLRZLUunVrNW/evGa/FAAAAKCWcXmwGDFihE6dOqVZs2YpOztbnTt31qZNm+wTuo8fP26f9yBJJ0+eVJcuXezr8fHxio+PV+/evZWSkiJJys3N1ZgxY5SVlSUfHx+FhYUpMTFR/fr1s2936NAhxcXF6cyZMwoODtbMmTM1bdq063PQAAAAwE3G5e+xuJnxHgsAAADUZrXmPRYAAAAAbg4ECwAAAABOI1gAAAAAcBrBAgAAAIDTCBYAAAAAnEawAAAAAOA0ggUAAAAApxEsAAAAADiNYAEAAADAaQQLAAAAAE4jWAAAAABwGsECAAAAgNMIFgAAAACcRrAAAAAA4DSCBQAAAACnESwAAAAAOI1gAQAAAMBpBAsAAAAATiNYAAAAAHAawQIAAACA0wgWAAAAAJxGsAAAAADgNIIFAAAAAKcRLAAAAAA4jWABAAAAwGkECwAAAABOI1gAAAAAcNoNESyWLl2q4OBgeXl5KSIiQrt27bpi3czMTA0bNkzBwcEymUxauHBhuToJCQkKCwuT1WqV1WpVZGSkNm7c6FDn8OHDGjp0qPz9/WW1WjV8+HDl5OTYPz969KjGjx+vVq1aydvbW61bt9bs2bNVWlpabccNAAAA3CxcHizWrl2r2NhYzZ49W3v37lWnTp00YMAA5ebmVli/uLhYISEhmj9/vgIDAyus07x5c82fP19paWnas2eP+vbtq8GDByszM1OSVFRUpP79+8tkMik5OVk7duxQaWmpBg0aJJvNJkn65ptvZLPZtGzZMmVmZuqVV17Ra6+9pr/85S8180UAAAAAtZjJMAzDlR2IiIhQ9+7dtWTJEkmSzWZTUFCQpkyZohkzZlx12+DgYE2dOlVTp0791XYaNmyoBQsWaPz48dq8ebOio6N19uxZWa1WSVJ+fr58fX21efNmRUVFVbiPBQsWKCEhQT/88EOljq2goEA+Pj7Kz8+3twMAAADUFlW5nnXpiEVpaanS0tIcLuTd3NwUFRWl1NTUammjrKxMa9asUVFRkSIjIyVJJSUlMplMMpvN9npeXl5yc3PT9u3
br7iv/Px8NWzYsFr6BQAAANxMXBosTp8+rbKyMgUEBDiUBwQEKDs726l9Z2RkyGKxyGw2a9KkSVq/fr3at28vSerZs6fq1aunZ555RsXFxSoqKtL06dNVVlamrKysCvf3/fffa/HixZo4ceIV2ywpKVFBQYHDAgAAANwKXD7Hoqa0bdtW6enp2rlzpyZPnqyYmBgdOHBAkuTv769169bpo48+ksVikY+Pj/Ly8tS1a1e5uZX/Sk6cOKGBAwfqoYce0h//+Mcrtjlv3jz5+PjYl6CgoBo7PgAAAOBG4lHVDX7++WcZhqG6detKko4dO2YfDejfv3+V9uXn5yd3d3eHpzFJUk5OzhUnZleWp6enQkNDJUnh4eHavXu3Fi1apGXLlkmS+vfvr8OHD+v06dPy8PBQgwYNFBgYqJCQEIf9nDx5Ur/97W/Vq1cv/fOf/7xqm3FxcYqNjbWvFxQUEC4AAABwS6jyiMXgwYO1cuVKSVJeXp4iIiL0t7/9TYMHD1ZCQkKV9uXp6anw8HAlJSXZy2w2m5KSkuzzIaqLzWZTSUlJuXI/Pz81aNBAycnJys3N1QMPPGD/7MSJE+rTp4/Cw8O1fPnyCkczfslsNtsfcXt5AQAAAG4FVQ4We/fu1V133SVJevfddxUQEKBjx45p5cqVevXVV6vcgdjYWL3++utasWKFDh48qMmTJ6uoqEjjxo2TJI0ZM0ZxcXH2+qWlpUpPT1d6erpKS0t14sQJpaen6/vvv7fXiYuL0+eff66jR48qIyNDcXFxSklJ0ahRo+x1li9fri+//FKHDx/Wv/71Lz300EOaNm2a2rZtK+n/hYoWLVooPj5ep06dUnZ2ttNzPwAAAICbUZVvhSouLlb9+vUlSZs3b9aDDz4oNzc39ezZU8eOHatyB0aMGKFTp05p1qxZys7OVufOnbVp0yb7hO7jx487jBScPHlSXbp0sa/Hx8crPj5evXv3VkpKiiQpNzdXY8aMUVZWlnx8fBQWFqbExET169fPvt2hQ4cUFxenM2fOKDg4WDNnztS0adPsn2/ZskXff/+9vv/+ezVv3tyhzy5+Qi8AAABww6nyeyzCwsL06KOPaujQobr99tu1adMmRUZGKi0tTffddx+/0f8F3mMBAACA2qxG32Mxa9YsTZ8+XcHBwYqIiLDPhdi8ebPDSAIAAACAW8c1vXk7OztbWVlZ6tSpk/02pV27dslqteq2226r9k7WVoxYAAAAoDaryvVsledYSFJgYKD9cbAFBQVKTk5W27ZtCRUAAADALarKt0INHz5cS5YskXTpnRbdunXT8OHDFRYWpvfee6/aOwgAAADgxlflYPH555/bHze7fv16GYahvLw8vfrqq/rrX/9a7R0EAAAAcOOrcrDIz89Xw4YNJUmbNm3SsGHDVLduXd1333367rvvqr2DAAAAAG58VQ4WQUFBSk1NVVFRkTZt2qT+/ftLks6ePSsvL69q7yAAAACAG1+VJ29PnTpVo0aNksViUcuWLdWnTx9Jl26R6tixY3X3DwAAAEAtUOVg8dhjj6lHjx768ccf1a9fP/vjZkNCQphjAQAAANyiruk9Fpdd3tRkMlVbh24mvMcCAAAAtVmNvnlbklauXKmOHTvK29tb3t7eCgsL09tvv31NnQUAAABQ+1X5Vqi///3veu655/TEE0/ojjvukCRt375dkyZN0unTpzVt2rRq7yQAAACAG1uVb4Vq1aqV5s6dqzFjxjiUr1ixQnPmzNGRI0eqtYO1GbdCAQAAoDar0VuhsrKy1KtXr3LlvXr1UlZWVlV3BwAAAOAmUOVgERoaqnfeeadc+dq1a9WmTZtq6RQAAACA2qXKcyzmzp2rESNG6PPPP7fPsdixY4eSkpIqDBwAAAAAbn5VHrEYNmyYdu7cKT8/P23YsEEbNmyQn5+fdu3apaFDh9ZEHwEAAADc4Jx6j8Uv5ebm6o033tBf/vKX6tjdTYHJ2wAAAKjNavw9FhXJysr
Sc889V127AwAAAFCLVFuwAAAAAHDrIlgAAAAAcBrBAgAAAIDTKv242djY2Kt+furUKac7AwAAAKB2qnSw2Ldv36/Wufvuu53qDAAAAIDaqdLB4rPPPqvJfgAAAACoxZhjAQAAAMBpBAsAAAAATiNYAAAAAHDaDREsli5dquDgYHl5eSkiIkK7du26Yt3MzEwNGzZMwcHBMplMWrhwYbk6CQkJCgsLk9VqldVqVWRkpDZu3OhQ5/Dhwxo6dKj8/f1ltVo1fPhw5eTkONQ5c+aMRo0aJavVqgYNGmj8+PE6d+5ctRwzAAAAcDNxebBYu3atYmNjNXv2bO3du1edOnXSgAEDlJubW2H94uJihYSEaP78+QoMDKywTvPmzTV//nylpaVpz5496tu3rwYPHqzMzExJUlFRkfr37y+TyaTk5GTt2LFDpaWlGjRokGw2m30/o0aNUmZmprZs2aKPP/5Yn3/+uSZMmFD9XwIAAABQ2xmV9NJLLxnFxcX29e3btxvnz5+3rxcUFBiTJ0+u7O7sevToYTz++OP29bKyMqNp06bGvHnzfnXbli1bGq+88kql2vH19TXeeOMNwzAMIzEx0XBzczPy8/Ptn+fl5Rkmk8nYsmWLYRiGceDAAUOSsXv3bnudjRs3GiaTyThx4kSl2szPzzckObQDAAAA1BZVuZ6t9IhFXFycCgsL7evR0dE6ceKEfb24uFjLli2rUqgpLS1VWlqaoqKi7GVubm6KiopSampqlfZ1JWVlZVqzZo2KiooUGRkpSSopKZHJZJLZbLbX8/Lykpubm7Zv3y5JSk1NVYMGDdStWzd7naioKLm5uWnnzp0VtlVSUqKCggKHBQAAALgVVDpYGIZx1fVrcfr0aZWVlSkgIMChPCAgQNnZ2U7tOyMjQxaLRWazWZMmTdL69evVvn17SVLPnj1Vr149PfPMMyouLlZRUZGmT5+usrIyZWVlSZKys7PVuHFjh316eHioYcOGV+zbvHnz5OPjY1+CgoKcOgYAAACgtnD5HIua0rZtW6Wnp2vnzp2aPHmyYmJidODAAUmSv7+/1q1bp48++kgWi0U+Pj7Ky8tT165d5eZ27V9JXFyc8vPz7cuPP/5YXYcDAAAA3NAq/ebtmuDn5yd3d/dyT2PKycm54sTsyvL09FRoaKgkKTw8XLt379aiRYvst2v1799fhw8f1unTp+Xh4aEGDRooMDBQISEhkqTAwMByE8gvXryoM2fOXLFvZrPZ4fYqAAAA4FZRpWDxxhtvyGKxSLp0kf3WW2/Jz89PkhzmX1SWp6enwsPDlZSUpCFDhkiSbDabkpKS9MQTT1R5f1djs9lUUlJSrvxy/5OTk5Wbm6sHHnhAkhQZGam8vDylpaUpPDzcXsdmsykiIqJa+wYAAADUdpUOFi1atNDrr79uXw8MDNTbb79drk5VxcbGKiYmRt26dVOPHj20cOFCFRUVady4cZKkMWPGqFmzZpo3b56kSxO+L9/SVFpaqhMnTig9PV0Wi8U+QhEXF6fo6Gi1aNFChYWFWrVqlVJSUpSYmGhvd/ny5WrXrp38/f2Vmpqqp556StOmTVPbtm0lSe3atdPAgQP1xz/+Ua+99pouXLigJ554Qg8//LCaNm1a5eMEAAAAbmaVDhZHjx6tkQ6MGDFCp06d0qxZs5Sdna3OnTtr06ZN9gndx48fd5j3cPLkSXXp0sW+Hh8fr/j4ePXu3VspKSmSpNzcXI0ZM0ZZWVny8fFRWFiYEhMT1a9fP/t2hw4dUlxcnM6cOaPg4GDNnDlT06ZNc+jb//3f/+mJJ57QPffcIzc3Nw0bNkyvvvpqjXwPAAAAQG1mMqrj8U6oUEFBgXx8fJSfny+r1erq7gAAAABVUpXr2Uo/Aik1NVUff/yxQ9nKlSvVqlUrNW7cWBMmTKhwDgMAAACAm1+lg8Xzzz+vzMxM+3pGRobGjx+vqKgozZgxQx999JF9HgQAAACAW0ulg0V6erruuece+/q
aNWsUERGh119/XbGxsXr11Vf1zjvv1EgnAQAAANzYKh0szp496/CG7H//+9+Kjo62r3fv3p0XwgEAAAC3qEoHi4CAAB05ckTSpce87t27Vz179rR/XlhYqDp16lR/DwEAAADc8CodLO69917NmDFD27ZtU1xcnOrWrau77rrL/vlXX32l1q1b10gnAQAAANzYKv0eixdeeEEPPvigevfuLYvFohUrVsjT09P++Ztvvqn+/fvXSCcBAAAA3Niq/B6L/Px8WSwWubu7O5SfOXNGFovFIWzc6niPBQAAAGqzqlzPVnrE4jIfH58Kyxs2bFjVXQEAAAC4SVQ6WPzhD3+oVL0333zzmjsDAAAAoHaqdLB466231LJlS3Xp0kVVvHsKAAAAwE2u0sFi8uTJWr16tY4cOaJx48bpkUce4fYnAAAAAJKq8LjZpUuXKisrS3/+85/10UcfKSgoSMOHD1diYiIjGAAAAMAtrspPhbrs2LFjeuutt7Ry5UpdvHhRmZmZslgs1d2/Wo2nQgEAAKA2q8r1bKVHLMpt6OYmk8kkwzBUVlZ2rbsBAAAAcBOoUrAoKSnR6tWr1a9fP/3mN79RRkaGlixZouPHjzNaAQAAANzCKj15+7HHHtOaNWsUFBSkP/zhD1q9erX8/Pxqsm8AAAAAaolKz7Fwc3NTixYt1KVLF5lMpivWe//996utc7UdcywAAABQm9XIm7fHjBlz1UABAAAA4NZVpRfkAQAAAEBFrvmpUAAAAABwGcECAAAAgNMIFgAAAACcRrAAAAAA4DSCBQAAAACnESwAAAAAOI1gAQAAAMBpLg8WS5cuVXBwsLy8vBQREaFdu3ZdsW5mZqaGDRum4OBgmUwmLVy4sFydhIQEhYWFyWq1ymq1KjIyUhs3bnSok52drdGjRyswMFD16tVT165d9d577znU+fbbbzV48GD5+fnJarXqzjvv1GeffVYtxwwAAADcbFwaLNauXavY2FjNnj1be/fuVadOnTRgwADl5uZWWL+4uFghISGaP3++AgMDK6zTvHlzzZ8/X2lpadqzZ4/69u2rwYMHKzMz015nzJgxOnTokD788ENlZGTowQcf1PDhw7Vv3z57nfvvv18XL15UcnKy0tLS1KlTJ91///3Kzs6u3i8BAAAAuAmYDMMwXNV4RESEunfvriVLlkiSbDabgoKCNGXKFM2YMeOq2wYHB2vq1KmaOnXqr7bTsGFDLViwQOPHj5ckWSwWJSQkaPTo0fY6jRo10ksvvaRHH31Up0+flr+/vz7//HPdddddkqTCwkJZrVZt2bJFUVFRlTq+goIC+fj4KD8/X1artVLbAAAAADeKqlzPumzEorS0VGlpaQ4X6W5uboqKilJqamq1tFFWVqY1a9aoqKhIkZGR9vJevXpp7dq1OnPmjGw2m9asWaPz58+rT58+ki6FjLZt22rlypUqKirSxYsXtWzZMjVu3Fjh4eHV0jcAAADgZuLhqoZPnz6tsrIyBQQEOJQHBATom2++cWrfGRkZioyM1Pnz52WxWLR+/Xq1b9/e/vk777yjESNGqFGjRvLw8FDdunW1fv16hYaGSpJMJpO2bt2qIUOGqH79+nJzc1Pjxo21adMm+fr6XrHdkpISlZSU2NcLCgqcOg4AAACgtnD55O2a0LZtW6Wnp2vnzp2aPHmyYmJidODAAfvnzz33nPLy8rR161bt2bNHsbGxGj58uDIyMiRJhmHo8ccfV+PGjbVt2zbt2rVLQ4YM0aBBg5SVlXXFdufNmycfHx/7EhQUVOPHCgAAANwIXDbHorS0VHXr1tW7776rIUOG2MtjYmKUl5enDz744KrbV2WORVRUlFq3bq1ly5bp8OHDCg0N1ddff60OHTo41AkNDdVrr72mpKQk9e/fX2fPnnW4l6xNmzYaP378Fed/VDRiERQUxBwLAAAA1Eq1Yo6Fp6enwsPDlZSUZC+z2WxKSkpymA9RHWw2m/2Cv7i4WNKl+Ry/5O7
uLpvNdtU6bm5u9joVMZvN9sfcXl4AAACAW4HL5lhIUmxsrGJiYtStWzf16NFDCxcuVFFRkcaNGyfp0mNhmzVrpnnz5km6NMpx+Zam0tJSnThxQunp6bJYLPb5EXFxcYqOjlaLFi1UWFioVatWKSUlRYmJiZKk2267TaGhoZo4caLi4+PVqFEjbdiwQVu2bNHHH38sSYqMjJSvr69iYmI0a9YseXt76/XXX9eRI0d03333Xe+vCQAAALjhuTRYjBgxQqdOndKsWbOUnZ2tzp07a9OmTfYJ3cePH3cYNTh58qS6dOliX4+Pj1d8fLx69+6tlJQUSVJubq7GjBmjrKws+fj4KCwsTImJierXr58kqU6dOvr00081Y8YMDRo0SOfOnVNoaKhWrFihe++9V5Lk5+enTZs2aebMmerbt68uXLigDh066IMPPlCnTp2u07cDAAAA1B4ufY/FzY73WAAAAKA2qxVzLAAAAADcPAgWAAAAAJxGsAAAAADgNIIFAAAAAKcRLAAAAAA4jWABAAAAwGkECwAAAABOI1gAAAAAcBrBAgAAAIDTCBYAAAAAnEawAAAAAOA0ggUAAAAApxEsAAAAADiNYAEAAADAaQQLAAAAAE4jWAAAAABwGsECAAAAgNMIFgAAAACcRrAAAAAA4DSCBQAAAACnESwAAAAAOI1gAQAAAMBpBAsAAAAATiNYAAAAAHAawQIAAACA0wgWAAAAAJxGsAAAAADgNIIFAAAAAKe5PFgsXbpUwcHB8vLyUkREhHbt2nXFupmZmRo2bJiCg4NlMpm0cOHCcnUSEhIUFhYmq9Uqq9WqyMhIbdy40aFOdna2Ro8ercDAQNWrV09du3bVe++9V25fn3zyiSIiIuTt7S1fX18NGTLE2cMFAAAAbkouDRZr165VbGysZs+erb1796pTp04aMGCAcnNzK6xfXFyskJAQzZ8/X4GBgRXWad68uebPn6+0tDTt2bNHffv21eDBg5WZmWmvM2bMGB06dEgffvihMjIy9OCDD2r48OHat2+fvc57772n0aNHa9y4cdq/f7927Nih3//+99X7BQAAAAA3CZNhGIarGo+IiFD37t21ZMkSSZLNZlNQUJCmTJmiGTNmXHXb4OBgTZ06VVOnTv3Vdho2bKgFCxZo/PjxkiSLxaKEhASNHj3aXqdRo0Z66aWX9Oijj+rixYsKDg7W3Llz7dtci4KCAvn4+Cg/P19Wq/Wa9wMAAAC4QlWuZ102YlFaWqq0tDRFRUX9v864uSkqKkqpqanV0kZZWZnWrFmjoqIiRUZG2st79eqltWvX6syZM7LZbFqzZo3Onz+vPn36SJL27t2rEydOyM3NTV26dFGTJk0UHR2tr7/++qrtlZSUqKCgwGEBAAAAbgUuCxanT59WWVmZAgICHMoDAgKUnZ3t1L4zMjJksVhkNps1adIkrV+/Xu3bt7d//s477+jChQtq1KiRzGazJk6cqPXr1ys0NFSS9MMPP0iS5syZo2effVYff/yxfH191adPH505c+aK7c6bN08+Pj72JSgoyKnjAAAAAGoLl0/erglt27ZVenq6du7cqcmTJysmJkYHDhywf/7cc88pLy9PW7du1Z49exQbG6vhw4crIyND0qVbsiRp5syZGjZsmMLDw7V8+XKZTCatW7fuiu3GxcUpPz/fvvz44481e6AAAADADcLDVQ37+fnJ3d1dOTk5DuU5OTlXnJhdWZ6envbRh/DwcO3evVuLFi3SsmXLdPjwYS1ZskRff/21OnToIEnq1KmTtm3bpqVLl+q1115TkyZNJMlhlMNsNiskJETHjx+/Yrtms1lms9mpvgMAAAC1kctGLDw9PRUeHq6kpCR7mc1mU1JSksN8iOpgs9lUUlIi6dKTpaRL8zl+yd3d3T5SER4eLrPZrEOHDtk/v3Dhgo4ePaqWLVtWa98AAACAm4HLRiwkKTY2VjExMerWrZt69OihhQsXqqioSOPGjZN06bGwzZo
107x58yRdmvB9+Zam0tJSnThxQunp6bJYLPYRiri4OEVHR6tFixYqLCzUqlWrlJKSosTEREnSbbfdptDQUE2cOFHx8fFq1KiRNmzYoC1btujjjz+WJFmtVk2aNEmzZ89WUFCQWrZsqQULFkiSHnrooev6HQEAAAC1gUuDxYgRI3Tq1CnNmjVL2dnZ6ty5szZt2mSf0H38+HGHkYWTJ0+qS5cu9vX4+HjFx8erd+/eSklJkSTl5uZqzJgxysrKko+Pj8LCwpSYmKh+/fpJkurUqaNPP/1UM2bM0KBBg3Tu3DmFhoZqxYoVuvfee+37XrBggTw8PDR69Gj9/PPPioiIUHJysnx9fa/DNwMAAADULi59j8XNjvdYAAAAoDarFe+xAAAAAHDzIFgAAAAAcBrBAgAAAIDTCBYAAAAAnEawAAAAAOA0ggUAAAAApxEsAAAAADiNYAEAAADAaQQLAAAAAE4jWAAAAABwGsECAAAAgNMIFgAAAACcRrAAAAAA4DSCBQAAAACnESwAAAAAOI1gAQAAAMBpBAsAAAAATiNYAAAAAHAawQIAAACA0wgWAAAAAJxGsAAAAADgNIIFAAAAAKcRLAAAAAA4jWABAAAAwGkECwAAAABOI1gAAAAAcBrBAgAAAIDTCBYAAAAAnHZDBIulS5cqODhYXl5eioiI0K5du65YNzMzU8OGDVNwcLBMJpMWLlxYrk5CQoLCwsJktVpltVoVGRmpjRs3OtTJzs7W6NGjFRgYqHr16qlr16567733KmyzpKREnTt3lslkUnp6ujOHCgAAANyUXB4s1q5dq9jYWM2ePVt79+5Vp06dNGDAAOXm5lZYv7i4WCEhIZo/f74CAwMrrNO8eXPNnz9faWlp2rNnj/r27avBgwcrMzPTXmfMmDE6dOiQPvzwQ2VkZOjBBx/U8OHDtW/fvnL7+/Of/6ymTZtWzwEDAAAANyGTYRiGKzsQERGh7t27a8mSJZIkm82moKAgTZkyRTNmzLjqtsHBwZo6daqmTp36q+00bNhQCxYs0Pjx4yVJFotFCQkJGj16tL1Oo0aN9NJLL+nRRx+1l23cuFGxsbF677331KFDB+3bt0+dO3eu1LEVFBTIx8dH+fn5slqtldoGAAAAuFFU5XrWpSMWpaWlSktLU1RUlL3Mzc1NUVFRSk1NrZY2ysrKtGbNGhUVFSkyMtJe3qtXL61du1ZnzpyRzWbTmjVrdP78efXp08deJycnR3/84x/19ttvq27dur/aVklJiQoKChwWAAAA4Fbg0mBx+vRplZWVKSAgwKE8ICBA2dnZTu07IyNDFotFZrNZkyZN0vr169W+fXv75++8844uXLigRo0ayWw2a+LEiVq/fr1CQ0MlSYZhaOzYsZo0aZK6detWqTbnzZsnHx8f+xIUFOTUMQAAAAC1hcvnWNSUtm3bKj09XTt37tTkyZMVExOjAwcO2D9/7rnnlJeXp61bt2rPnj2KjY3V8OHDlZGRIUlavHixCgsLFRcXV+k24+LilJ+fb19+/PHHaj8uAAAA4Ebk4crG/fz85O7urpycHIfynJycK07MrixPT0/76EN4eLh2796tRYsWadmyZTp8+LCWLFmir7/+Wh06dJAkderUSdu2bdPSpUv12muvKTk5WampqTKbzQ777datm0aNGqUVK1aUa9NsNperDwAAANwKXDpi4enpqfDwcCUlJdnLbDabkpKSHOZDVAebzaaSkhJJl54sJV2az/FL7u7ustlskqRXX31V+/fvV3p6utLT0/Xpp59KuvQUqxdffLFa+wYAAADUdi4dsZCk2NhYxcTEqFu3burRo4cWLlyooqIijRs3TtKlx8I2a9ZM8+bNk3RpwvflW5pKS0t14sQJpaeny2Kx2Eco4uLiFB0drRYtWqiwsFCrVq1SSkqKEhMTJUm33XabQkNDNXHiRMXHx6tRo0basGGDtmzZoo8//liS1KJFC4d+WiwWSVLr1q3VvHnzmv9iAAA
AgFrE5cFixIgROnXqlGbNmqXs7Gx17txZmzZtsk/oPn78uMPIwsmTJ9WlSxf7enx8vOLj49W7d2+lpKRIknJzczVmzBhlZWXJx8dHYWFhSkxMVL9+/SRJderU0aeffqoZM2Zo0KBBOnfunEJDQ7VixQrde++91+/gAQAAgJuEy99jcTPjPRYAAACozWrNeywAAAAA3BwIFgAAAACcRrAAAAAA4DSCBQAAAACnESwAAAAAOI1gAQAAAMBpBAsAAAAATiNYAAAAAHAawQIAAACA0wgWAAAAAJxGsAAAAADgNIIFAAAAAKcRLAAAAAA4jWABAAAAwGkECwAAAABOI1gAAAAAcBrBAgAAAIDTCBYAAAAAnEawAAAAAOA0ggUAAAAApxEsAAAAADiNYAEAAADAaR6u7sDNzDAMSVJBQYGLewIAAABU3eXr2MvXtVdDsKhBhYWFkqSgoCAX9wQAAAC4doWFhfLx8blqHZNRmfiBa2Kz2XTy5EnVr19fJpPJ1d25ZRQUFCgoKEg//vijrFarq7uD64TzfmvivN+aOO+3Js67axiGocLCQjVt2lRublefRcGIRQ1yc3NT8+bNXd2NW5bVauUfnlsQ5/3WxHm/NXHeb02c9+vv10YqLmPyNgAAAACnESwAAAAAOI1ggZuO2WzW7NmzZTabXd0VXEec91sT5/3WxHm/NXHeb3xM3gYAAADgNEYsAAAAADiNYAEAAADAaQQLAAAAAE4jWKBWOnPmjEaNGiWr1aoGDRpo/PjxOnfu3FW3OX/+vB5//HE1atRIFotFw4YNU05OToV1f/rpJzVv3lwmk0l5eXk1cASoqpo45/v379fIkSMVFBQkb29vtWvXTosWLarpQ8FVLF26VMHBwfLy8lJERIR27dp11frr1q3TbbfdJi8vL3Xs2FGffvqpw+eGYWjWrFlq0qSJvL29FRUVpe+++64mDwHXoDrP+4ULF/TMM8+oY8eOqlevnpo2baoxY8bo5MmTNX0YqKLq/vv+S5MmTZLJZNLChQurude4KgOohQYOHGh06tTJ+PLLL41t27YZoaGhxsiRI6+6zaRJk4ygoCAjKSnJ2LNnj9GzZ0+jV69eFdYdPHiwER0dbUgyzp49WwNHgKqqiXP+v//7v8aTTz5ppKSkGIcPHzbefvttw9vb21i8eHFNHw4qsGbNGsPT09N48803jczMTOOPf/yj0aBBAyMnJ6fC+jt27DDc3d2Nl19+2Thw4IDx7LPPGnXq1DEyMjLsdebPn2/4+PgYGzZsMPbv32888MADRqtWrYyff/75eh0WfkV1n/e8vDwjKirKWLt2rfHNN98YqampRo8ePYzw8PDreVj4FTXx9/2y999/3+jUqZPRtGlT45VXXqnhI8EvESxQ6xw4cMCQZOzevdtetnHjRsNkMhknTpyocJu8vDyjTp06xrp16+xlBw8eNCQZqampDnX/8Y9/GL179zaSkpIIFjeImj7nv/TYY48Zv/3tb6uv86i0Hj16GI8//rh9vayszGjatKkxb968CusPHz7cuO+++xzKIiIijIkTJxqGYRg2m80IDAw0FixYYP88Ly/PMJvNxurVq2vgCHAtqvu8V2TXrl2GJOPYsWPV02k4rabO+3/+8x+jWbNmxtdff220bNmSYHGdcSsUap3U1FQ1aNBA3bp1s5dFRUXJzc1NO3furHCbtLQ0XbhwQVFRUfay2267TS1atFBqaqq97MCBA3r++ee1cuVKubnx1+NGUZPn/L/l5+erYcOG1dd5VEppaanS0tIczpebm5uioqKueL5SU1Md6kvSgAED7PWPHDmi7Oxshzo+Pj6KiIi46p8BXD81cd4rkp+fL5PJpAYNGlRLv+GcmjrvNptNo0eP1tNPP60OHTrUTOdxVVw5odbJzs5W48aNHco8PDzUsGFDZWdnX3EbT0/Pcv9TCQgIsG9TUlKikSNHasGCBWrRokWN9B3XpqbO+X/74osvtHbtWk2YMKFa+o3
KO336tMrKyhQQEOBQfrXzlZ2dfdX6l/9blX3i+qqJ8/7fzp8/r2eeeUYjR46U1Wqtno7DKTV13l966SV5eHjoySefrP5Oo1IIFrhhzJgxQyaT6arLN998U2Ptx8XFqV27dnrkkUdqrA04cvU5/6Wvv/5agwcP1uzZs9W/f//r0iaAmnXhwgUNHz5chmEoISHB1d1BDUpLS9OiRYv01ltvyWQyubo7tywPV3cAuOxPf/qTxo4de9U6ISEhCgwMVG5urkP5xYsXdebMGQUGBla4XWBgoEpLS5WXl+fwG+ycnBz7NsnJycrIyNC7774r6dLTZCTJz89PM2fO1Ny5c6/xyHAlrj7nlx04cED33HOPJkyYoGefffaajgXO8fPzk7u7e7kntVV0vi4LDAy8av3L/83JyVGTJk0c6nTu3Lkae49rVRPn/bLLoeLYsWNKTk5mtOIGUhPnfdu2bcrNzXW446CsrEx/+tOftHDhQh09erR6DwIVYsQCNwx/f3/ddtttV108PT0VGRmpvLw8paWl2bdNTk6WzWZTREREhfsODw9XnTp1lJSUZC87dOiQjh8/rsjISEnSe++9p/379ys9PV3p6el64403JF36x+rxxx+vwSO/dbn6nEtSZmamfvvb3yomJkYvvvhizR0srsrT01Ph4eEO58tmsykpKcnhfP1SZGSkQ31J2rJli71+q1atFBgY6FCnoKBAO3fuvOI+cX3VxHmX/l+o+O6777R161Y1atSoZg4A16Qmzvvo0aP11Vdf2f8fnp6erqZNm+rpp59WYmJizR0MHLl69jhwLQYOHGh06dLF2Llzp7F9+3ajTZs2Do8e/c9//mO0bdvW2Llzp71s0qRJRosWLYzk5GRjz549RmRkpBEZGXnFNj777DOeCnUDqYlznpGRYfj7+xuPPPKIkZWVZV9yc3Ov67HhkjVr1hhms9l46623jAMHDhgTJkwwGjRoYGRnZxuGYRijR482ZsyYYa+/Y8cOw8PDw4iPjzcOHjxozJ49u8LHzTZo0MD44IMPjK+++soYPHgwj5u9wVT3eS8tLTUeeOABo3nz5kZ6errD3+2SkhKXHCPKq4m/7/+Np0JdfwQL1Eo//fSTMXLkSMNisRhWq9UYN26cUVhYaP/8yJEjhiTjs88+s5f9/PPPxmOPPWb4+voadevWNYYOHWpkZWVdsQ2CxY2lJs757NmzDUnllpYtW17HI8MvLV682GjRooXh6elp9OjRw/jyyy/tn/Xu3duIiYlxqP/OO+8Yv/nNbwxPT0+jQ4cOxieffOLwuc1mM5577jkjICDAMJvNxj333GMcOnToehwKqqA6z/vlfwsqWn757wNcr7r/vv83gsX1ZzKM//9GcgAAAAC4RsyxAAAAAOA0ggUAAAAApxEsAAAAADiNYAEAAADAaQQLAAAAAE4jWAAAAABwGsECAAAAgNMIFgAAAACcRrAAANxyTCaTNmzY4OpuAMBNhWABALiuxo4dK5PJVG4ZOHCgq7sGAHCCh6s7AAC49QwcOFDLly93KDObzS7qDQCgOjBiAQC47sxmswIDAx0WX19fSZduU0pISFB0dLS8vb0VEhKid99912H7jIwM9e3bV97e3mrUqJEmTJigc+fOOdR588031aFDB5nNZjVp0kRPPPGEw+enT5/W0KFDVbduXbVp00YffvhhzR40ANzkCBYAgBvOc889p2HDhmn//v0aNWqUHn74YR08eFCSVFRUpAEDBsjX11e7d+/WunXrtHXrVofgkJCQoMcff1wTJkxQRkaGPvzwQ4WGhjq0MXfuXA0fPlxfffWV7r33Xo0aNUpnzpy5rscJADcTk2EYhqs7AQC4dYwdO1b/+te/5OXl5VD+l7/8RX/5y19kMpk0adIkJSQk2D/r2bOnunbtqn/84x96/fXX9cwzz+jHH39UvXr1JEmffvqpBg0apJMnTyogIEDNmjXTuHHj9Ne//rXCPphMJj377LN64YUXJF0KKxa
LRRs3bmSuBwBcI+ZYAACuu9/+9rcOwUGSGjZsaP85MjLS4bPIyEilp6dLkg4ePKhOnTrZQ4Uk3XHHHbLZbDp06JBMJpNOnjype+6556p9CAsLs/9cr149Wa1W5ebmXushAcAtj2ABALju6tWrV+7WpOri7e1dqXp16tRxWDeZTLLZbDXRJQC4JTDHAgBww/nyyy/Lrbdr106S1K5dO+3fv19FRUX2z3fs2CE3Nze1bdtW9evXV3BwsJKSkq5rnwHgVseIBQDguispKVF2drZDmYeHh/z8/CRJ69atU7du3XTnnXfq//7v/7Rr1y797//+ryRp1KhRmj17tmJiYjRnzhydOnVKU6ZM0ejRoxUQECBJmjNnjiZNmqTGjRsrOjpahYWF2rFjh6ZMmXJ9DxQAbiEECwDAdbdp0yY1adLEoaxt27b65ptvJF16YtOaNWv02GOPqUmTJlq9erXat28vSapbt64SExP11FNPqXv37qpbt66GDRumv//97/Z9xcTE6Pz583rllVc0ffp0+fn56Xe/+931O0AAuAXxVCgAwA3FZDJp/fr1GjJkiKu7AgCoAuZYAAAAAHAawQIAAACA05hjAQC4oXCHLgDUToxYAAAAAHAawQIAAACA0wgWAAAAAJxGsAAAAADgNIIFAAAAAKcRLAAAAAA4jWABAAAAwGkECwAAAABOI1gAAAAAcNr/B5wzbdfAcL2ZAAAAAElFTkSuQmCC", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "plt.figure(figsize=(8, 4))\n", + "plt.plot(train_losses, label=\"Train\")\n", + "plt.plot(val_losses, label=\"Val\")\n", + "plt.xlabel(\"Epoch\")\n", + "plt.ylabel(\"MSE Loss\")\n", + "plt.title(\"Training & Validation Loss\")\n", + "plt.legend()\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "74dd610894ef48b8853583908a4fa0ce", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Loading pipeline components...: 0%| | 0/6 [00:00 by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling it only for use-cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 .\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (351 > 77). Running this sequence through the model will result in indexing errors\n", + "The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['your deck .\", \" energytype \": null , \" evolvefrom \": null , \" hp \": null , \" id \": \" sv 0 8 . 5 - 1 0 5 \", \" illustrator \": \" gidora \", \" image \": \" https :// assets . tcgdex . net / en / sv / sv 0 8 . 
5 / 1 0 5 \", \" item \": null , \" legal \": {\" expanded \": true , \" standard \": true }, \" level \": null , \" localid \": \" 1 0 5 \", \" name \": \" crispin \", \" rarity \": \" uncommon \", \" regulationmark \": \" h \", \" resistances \": null , \" retreat \": null , \" set \": {\" cardcount \": {\" official \": 1 3 1 , \" total \": 1 8 0 }, \" id \": \" sv 0 8 . 5 \", \" logo \": \" https :// assets . tcgdex . net / en / sv / sv 0 8 . 5 / logo \", \" name \": \" prismatic evolutions \", \" symbol \": null }, \" stage \": null , \" suffix \": null , \" trainertype \": \" supporter \", \" types \": null , \" variants \": {\" firstedition \": false , \" holo \": true , \" normal \": true , \" reverse \": true , \" wpromo \": false }, \" weaknesses \": null }']\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "dbfa4d8442f44140b9e0b9cd17e2b64a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/30 [00:00" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Visualiser des générations sur des prompts de validation\n", + "from diffusers import StableDiffusionPipeline\n", + "import random\n", + "import copy\n", + "\n", + "unet.eval()\n", + "# Merge LoRA weights into a COPY so the original unet stays trainable\n", + "unet_copy = copy.deepcopy(unet)\n", + "unet_merged = unet_copy.merge_and_unload()\n", + "\n", + "pipe = StableDiffusionPipeline.from_pretrained(\n", + " model_id,\n", + " vae=vae,\n", + " unet=unet_merged,\n", + " text_encoder=text_encoder,\n", + " tokenizer=tokenizer,\n", + " safety_checker=None,\n", + ")\n", + "pipe = pipe.to(device)\n", + "del unet_copy # free memory\n", + "\n", + "indices = random.sample(range(len(val_dataset)), 4)\n", + "\n", + "def get_val_path(i):\n", + " dataset_idx = int(val_dataset.indices[i]) # tensor → int\n", + " return val_dataset.dataset.image_paths[dataset_idx]\n", + "\n", + "conditionings = 
[metadata_to_conditioning(get_metadata_from_png(get_val_path(indices[i]))) for i in range(4)]\n", + "\n", + "fig, axes = plt.subplots(2, 4, figsize=(16, 8))\n", + "for i, cond in enumerate(conditionings):\n", + " out = pipe(cond, num_inference_steps=30, guidance_scale=7.5).images[0]\n", + " axes[0, i].imshow(out)\n", + " axes[0, i].axis(\"off\")\n", + " axes[0, i].set_title(cond[:50] + \"...\", fontsize=8)\n", + "\n", + " real = Image.open(get_val_path(indices[i])).convert(\"RGB\").resize((512, 512))\n", + " axes[1, i].imshow(real)\n", + " axes[1, i].axis(\"off\")\n", + " axes[1, i].set_title(\"Ground truth\", fontsize=8)\n", + "\n", + "plt.suptitle(\"Generated vs Ground Truth\")\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def generate_card_from_metadata(meta, num_inference_steps=30, guidance_scale=7.5, save_path=None):\n", + " \"\"\"Génère une carte à partir d'un dict au format pokemon_metadata (sortie de ta pipeline prompt → JSON).\"\"\"\n", + " conditioning = metadata_to_conditioning(meta)\n", + " image = pipe(conditioning, num_inference_steps=num_inference_steps, guidance_scale=guidance_scale).images[0]\n", + " if save_path:\n", + " image.save(save_path)\n", + " return image\n", + "\n", + "# Exemple : meta = sortie de ta pipeline (prompt utilisateur → JSON)\n", + "example_meta = {\n", + " \"name\": \"Charizard\", \"types\": [\"Fire\"], \"hp\": 120, \"stage\": \"Stage2\", \"rarity\": \"Rare\",\n", + " \"attacks\": [{\"name\": \"Fire Spin\", \"damage\": \"100\"}, {\"name\": \"Flamethrower\", \"damage\": \"50\"}],\n", + "}\n", + "out = generate_card_from_metadata(example_meta)\n", + "out" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: 
['], \" category \": \" pokemon \", \" description \": \" it makes a nest to suit its long and skinny body . the nest is impossible for other pokémon to enter .\", \" evolvefrom \": \" sentret \", \" hp \": 1 1 0 , \" id \": \" swsh 3 - 1 3 6 \", \" illustrator \": \" tetsuya koizumi \", \" image \": \" https :// assets . tcgdex . net / en / swsh / swsh 3 / 1 3 6 \", \" localid \": \" 1 3 6 \", \" name \": \" furret \", \" rarity \": \" uncommon \", \" regulationmark \": \" d \", \" retreat \": 1 , \" set \": {\" cardcount \": {\" official \": 1 8 9 , \" total \": 2 0 1 }, \" id \": \" swsh 3 \", \" logo \": \" https :// assets . tcgdex . net / en / swsh / swsh 3 / logo \", \" name \": \" darkness ablaze \", \" symbol \": \" https :// assets . tcgdex . net / univ / swsh / swsh 3 / symbol \"}, \" stage \": \" stage 1 \", \" types \": [\" colorless \"], \" weaknesses \": [{\" type \": \" fighting \", \" value \": \"× 2 \"}]}']\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b90f4851a87f4feb83ee481149dd3347", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/75 [00:00" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def generate_card_from_metadata(meta, num_inference_steps=30, guidance_scale=7.5, save_path=None):\n", + " \"\"\"Génère une carte à partir d'un dict au format pokemon_metadata (sortie de ta pipeline prompt → JSON).\"\"\"\n", + " conditioning = metadata_to_conditioning(meta)\n", + " image = pipe(conditioning, num_inference_steps=num_inference_steps, guidance_scale=guidance_scale).images[0]\n", + " if save_path:\n", + " image.save(save_path)\n", + " return image\n", + "\n", + "# Exemple : meta = sortie de ta pipeline (prompt utilisateur → JSON)\n", + "example_meta = {\n", + " \"category\": \"Pokemon\",\n", + " \"id\": \"swsh3-136\",\n", + " \"illustrator\": \"tetsuya koizumi\",\n", + " \"image\": 
\"https://assets.tcgdex.net/en/swsh/swsh3/136\",\n", + " \"localId\": \"136\",\n", + " \"name\": \"Furret\",\n", + " \"rarity\": \"Uncommon\",\n", + " \"set\": {\n", + " \"cardCount\": {\n", + " \"official\": 189,\n", + " \"total\": 201\n", + " },\n", + " \"id\": \"swsh3\",\n", + " \"logo\": \"https://assets.tcgdex.net/en/swsh/swsh3/logo\",\n", + " \"name\": \"Darkness Ablaze\",\n", + " \"symbol\": \"https://assets.tcgdex.net/univ/swsh/swsh3/symbol\"\n", + " },\n", + " \"hp\": 110,\n", + " \"types\": [\n", + " \"Colorless\"\n", + " ],\n", + " \"evolveFrom\": \"Sentret\",\n", + " \"description\": \"It makes a nest to suit its long and skinny body. The nest is impossible for other Pokémon to enter.\",\n", + " \"stage\": \"Stage1\",\n", + " \"attacks\": [\n", + " {\n", + " \"cost\": [\n", + " \"Colorless\"\n", + " ],\n", + " \"name\": \"Feelin' Fine\",\n", + " \"effect\": \"Draw 3 cards.\"\n", + " },\n", + " {\n", + " \"cost\": [\n", + " \"Colorless\"\n", + " ],\n", + " \"name\": \"Tail Smash\",\n", + " \"effect\": \"Flip a coin. 
If tails, this attack does nothing.\",\n", + " \"damage\": 90\n", + " }\n", + " ],\n", + " \"weaknesses\": [\n", + " {\n", + " \"type\": \"Fighting\",\n", + " \"value\": \"×2\"\n", + " }\n", + " ],\n", + " \"retreat\": 1,\n", + " \"regulationMark\": \"D\"\n", + "}\n", + "out = generate_card_from_metadata(example_meta)\n", + "out" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "LoRA saved to /home/do5-ajlp/juicepyter/pokemon_card_lora (1 epochs total)\n" + ] + } + ], + "source": [ + "# Sauvegarder le LoRA et l'historique d'entraînement\n", + "LORA_PATH = \"/home/do5-ajlp/juicepyter/pokemon_card_lora\"\n", + "unet.save_pretrained(LORA_PATH)\n", + "torch.save({\n", + " \"train_losses\": train_losses,\n", + " \"val_losses\": val_losses,\n", + " \"epochs\": len(train_losses),\n", + "}, LORA_PATH + \"/training_history.pt\")\n", + "print(f\"LoRA saved to {LORA_PATH} ({len(train_losses)} epochs total)\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/prompt_to_card_pipeline.py b/prompt_to_card_pipeline.py new file mode 100644 index 0000000..bf34091 --- /dev/null +++ b/prompt_to_card_pipeline.py @@ -0,0 +1,346 @@ +"""End-to-end prompt -> cleaned text -> inferred JSON -> generated card image. + +This script is built to connect the three stages described by the user: +1) call get_clean_text(user_text) from a text-cleaning module file +2) pass cleaned text into infer_json_usage.py with --json-only --template +3) load a checkpoint and generate a card image from inferred metadata + +The model-loading part is intentionally pluggable because checkpoint structures vary. 
+If your .pt checkpoint cannot be used directly as a callable pipeline, provide a +generator module implementing: + + def build_pipeline(checkpoint_path: str, device: str): ... + def metadata_to_conditioning(meta: dict) -> str: ... # optional +""" + +from __future__ import annotations + +import argparse +import importlib +import importlib.util +import json +import subprocess +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Callable, Mapping + + +def _load_module_from_file(module_file: str): + module_path = Path(module_file).resolve() + if not module_path.exists(): + raise FileNotFoundError(f"Module file not found: {module_path}") + + spec = importlib.util.spec_from_file_location(module_path.stem, str(module_path)) + if spec is None or spec.loader is None: + raise ImportError(f"Cannot import module from file: {module_path}") + + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + print("module successfully charged") + return module + + +def _load_function_from_file(module_file: str, function_name: str) -> Callable[..., Any]: + print("model charging 1") + module = _load_module_from_file(module_file) + print("model charged 1") + if not hasattr(module, function_name): + raise AttributeError(f"{module_file} has no function named '{function_name}'") + func = getattr(module, function_name) + if not callable(func): + raise TypeError(f"{function_name} in {module_file} is not callable") + return func + + +def _extract_json_from_output(raw: str) -> Mapping[str, Any]: + print("_extract_json_from_output") + stripped = raw.strip() + if not stripped: + raise ValueError("Inference command returned empty output") + + try: + parsed = json.loads(stripped) + if not isinstance(parsed, dict): + raise ValueError("Inference output is JSON but not an object") + return parsed + except json.JSONDecodeError: + pass + + # Fallback: parse the last JSON object in mixed stdout. 
+ last_open = stripped.rfind("{") + last_close = stripped.rfind("}") + if last_open == -1 or last_close == -1 or last_close <= last_open: + raise ValueError(f"Could not parse JSON from inference output:\n{raw}") + + candidate = stripped[last_open : last_close + 1] + parsed = json.loads(candidate) + print("json parsed with success") + if not isinstance(parsed, dict): + raise ValueError("Parsed fallback JSON is not an object") + return parsed + + +def run_infer_json_cli( + infer_script_path: str, + template_path: str, + cleaned_text: str, + python_executable: str | None = None, +) -> Mapping[str, Any]: + infer_script = Path(infer_script_path).resolve() + print("run_infer_json_cli") + if not infer_script.exists(): + raise FileNotFoundError(f"infer_json_usage.py not found: {infer_script}") + + template_file = Path(template_path).resolve() + if not template_file.exists(): + raise FileNotFoundError(f"Template file not found: {template_file}") + + cmd = [ + python_executable or sys.executable, + str(infer_script), + "--json-only", + "--template", + str(template_file), + cleaned_text, + ] + print("will start result") + result = subprocess.run(cmd, capture_output=True, text=True, check=False) + + if result.returncode != 0: + stderr = result.stderr.strip() + raise RuntimeError( + "JSON inference command failed. 
" + f"exit={result.returncode}, stderr={stderr or ''}" + ) + print("result is done") + return _extract_json_from_output(result.stdout) + + +def default_metadata_to_conditioning(meta: Mapping[str, Any]) -> str: + print("default_metadata_to_conditioning") + name = str(meta.get("name", "Unknown Pokemon")) + types = meta.get("types") or [] + if isinstance(types, list): + type_text = ", ".join(str(item) for item in types if item) or str(meta.get("type", "normal")) + else: + type_text = str(meta.get("type", "normal")) + + attacks = meta.get("attacks") or [] + attack_names = [] + if isinstance(attacks, list): + for attack in attacks: + if isinstance(attack, dict): + value = attack.get("name") + if value: + attack_names.append(str(value)) + elif attack: + attack_names.append(str(attack)) + + hp = str(meta.get("hp", "60")) + description = str(meta.get("description", "")) + + parts = [ + f"Pokemon trading card illustration of {name}", + f"type: {type_text}", + f"hp: {hp}", + ] + if attack_names: + parts.append(f"attacks: {', '.join(attack_names[:2])}") + if description: + parts.append(f"description: {description}") + return "; ".join(parts) + + +@dataclass +class CheckpointCardGenerator: + checkpoint_path: str + device: str = "cpu" + generator_module_path: str = "" + + def __post_init__(self) -> None: + self._pipe = self._build_pipe() + self._metadata_to_conditioning = self._build_conditioning_function() + + def _build_pipe(self): + if self.generator_module_path: + print("getting module") + module = _load_module_from_file(self.generator_module_path) + print("module got") + if not hasattr(module, "build_pipeline"): + raise AttributeError( + "Custom generator module must define build_pipeline(checkpoint_path, device)." 
+ ) + print("building pipeline") + build_pipeline = getattr(module, "build_pipeline") + if not callable(build_pipeline): + raise TypeError("build_pipeline exists but is not callable") + print("pipeline build") + return build_pipeline(self.checkpoint_path, self.device) + + # Best-effort direct checkpoint loading for simple callable pipeline dumps. + try: + torch = importlib.import_module("torch") + except ModuleNotFoundError as exc: + raise RuntimeError( + "torch is required to load checkpoint files. Install torch or provide --generator-module." + ) from exc + print("loading checkpoint") + checkpoint = torch.load(self.checkpoint_path, map_location=self.device) + print("checkpoint loaded") + + if callable(checkpoint): + return checkpoint + + if isinstance(checkpoint, dict): + for key in ("pipe", "pipeline", "model"): + candidate = checkpoint.get(key) + if callable(candidate): + return candidate + + raise RuntimeError( + "Could not construct a callable generation pipeline from checkpoint. " + "Pass --generator-module with a build_pipeline() function for your model layout." + ) + + def _build_conditioning_function(self) -> Callable[[Mapping[str, Any]], str]: + if self.generator_module_path: + print("model charge 2") + module = _load_module_from_file(self.generator_module_path) + print("model charged 2") + if hasattr(module, "metadata_to_conditioning"): + func = getattr(module, "metadata_to_conditioning") + if callable(func): + return func + return default_metadata_to_conditioning + + def generate_card_from_metadata( + self, + meta: Mapping[str, Any], + num_inference_steps: int = 30, + guidance_scale: float = 7.5, + save_path: str | None = None, + ): + conditioning = self._metadata_to_conditioning(meta) + result = self._pipe( + conditioning, + num_inference_steps=num_inference_steps, + guidance_scale=guidance_scale, + ) + + if not hasattr(result, "images") or not result.images: + raise RuntimeError( + "Pipeline call did not return an object with non-empty .images. 
" + "Ensure your pipeline follows diffusers-style output." + ) + + image = result.images[0] + if save_path: + output_file = Path(save_path).resolve() + output_file.parent.mkdir(parents=True, exist_ok=True) + image.save(str(output_file)) + return image + + +def _build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + description="Run text cleaning + JSON inference + card generation in one command.", + ) + parser.add_argument("text", help="User input text.") + parser.add_argument( + "--text-cleaner-path", + required=True, + help="Path to text-cleaning-pipeline.py that defines get_clean_text(text).", + ) + parser.add_argument( + "--infer-script-path", + required=True, + help="Path to infer_json_usage.py.", + ) + parser.add_argument( + "--template", + required=True, + help="Path to JSON template file.", + ) + parser.add_argument( + "--checkpoint", + required=True, + help="Path to model checkpoint (example: pokemon_card_lora/training_history.pt).", + ) + parser.add_argument( + "--generator-module", + default="", + help="Optional module path defining build_pipeline() and metadata_to_conditioning().", + ) + parser.add_argument("--device", default="cpu", help="Checkpoint loading device (default: cpu).") + parser.add_argument("--num-inference-steps", type=int, default=30) + parser.add_argument("--guidance-scale", type=float, default=7.5) + parser.add_argument("--save-path", default="generated_card.png") + parser.add_argument( + "--python-executable", + default=sys.executable, + help="Python executable used to run infer_json_usage.py (default: current interpreter).", + ) + parser.add_argument( + "--print-json", + action="store_true", + help="Print inferred JSON to stdout.", + ) + parser.add_argument( + "--print-clean-text", + action="store_true", + help="Print cleaned text to stdout.", + ) + return parser + + +def main() -> None: + args = _build_parser().parse_args() + print("main get clean text") + + get_clean_text = 
_load_function_from_file(args.text_cleaner_path, "get_clean_text") + print("main got clean text") + + cleaned_text = get_clean_text(args.text) + print("main got args.text") + if not isinstance(cleaned_text, str): + raise TypeError("get_clean_text(...) must return a string") + print("main get inferred") + + inferred_json = run_infer_json_cli( + infer_script_path=args.infer_script_path, + template_path=args.template, + cleaned_text=cleaned_text, + python_executable=args.python_executable, + ) + print("main got inferred") + print("main get generator") + + + + generator = CheckpointCardGenerator( + checkpoint_path=args.checkpoint, + device=args.device, + generator_module_path=args.generator_module, + ) + print("main got generator and will generate card") + + generator.generate_card_from_metadata( + inferred_json, + num_inference_steps=args.num_inference_steps, + guidance_scale=args.guidance_scale, + save_path=args.save_path, + ) + print("main card generated") + + + if args.print_clean_text: + print(cleaned_text) + if args.print_json: + print(json.dumps(inferred_json, indent=2)) + + print(f"Card generated and saved to: {args.save_path}") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/text-cleaner/.ipynb_checkpoints/pokemon_text_cleaning-checkpoint.ipynb b/text-cleaner/.ipynb_checkpoints/pokemon_text_cleaning-checkpoint.ipynb new file mode 100644 index 0000000..e152832 --- /dev/null +++ b/text-cleaner/.ipynb_checkpoints/pokemon_text_cleaning-checkpoint.ipynb @@ -0,0 +1,298 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 🎴 Génération de Carte Pokémon depuis un Texte Descriptif\n", + "## Partie 1 — Nettoyage du Texte (NLU Pipeline)\n", + "\n", + "On prend un texte descriptif fourni par l'utilisateur et on le nettoie étape par étape.\n", + "\n", + "```\n", + "Texte brut → Noise Removal → Tokenization → Stopwords → Lemmatization → Texte propre\n", + "```" + ] + }, + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "---\n", + "## 📦 Installation des dépendances" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "ename": "", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[1;31mRunning cells with 'Python 3.12.3' requires the ipykernel package.\n", + "\u001b[1;31mCreate a Python Environment with the required packages.\n", + "\u001b[1;31mOr install 'ipykernel' using the command: '/usr/bin/python3 -m pip install ipykernel -U --user --force-reinstall'" + ] + } + ], + "source": [ + "!pip install nltk --quiet\n", + "\n", + "import nltk\n", + "nltk.download('punkt', quiet=True)\n", + "nltk.download('punkt_tab', quiet=True)\n", + "nltk.download('stopwords', quiet=True)\n", + "nltk.download('wordnet', quiet=True)\n", + "nltk.download('averaged_perceptron_tagger', quiet=True)\n", + "nltk.download('averaged_perceptron_tagger_eng', quiet=True)\n", + "\n", + "print(\"✅ Dépendances installées !\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## 📝 Saisie du texte utilisateur" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_text = \"\"\"\n", + "This is a HUGE fire dragon!!! It has got massive red wings and shoots \n", + "powerfull flames from its mouth... 
It's super fast n really strong!!\n", + "Its body is coverd with shiny golden scales & it lives in volcanos.\n", + "it luv to fight other pokémons and is very very aggressive >:(\n", + "\"\"\"\n", + "\n", + "print(\"📄 Texte brut :\")\n", + "print(raw_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## 🧹 Étape 1 — Noise Removal\n", + "\n", + "On supprime la ponctuation, les caractères spéciaux, les mots trop courts, et on met tout en minuscules.\n", + "\n", + "> 📖 *Cours page 25-29 — `removePunctuation`, `removeShortWords`, `removePattern`*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "import string\n", + "\n", + "def remove_punctuation(text):\n", + " \"\"\"Supprime la ponctuation du texte.\"\"\"\n", + " mapping_table = text.maketrans('', '', string.punctuation)\n", + " return text.translate(mapping_table)\n", + "\n", + "def remove_special_chars(text):\n", + " \"\"\"Supprime les caractères non-ASCII (emojis, accents parasites...).\"\"\"\n", + " text = text.encode('ascii', 'ignore').decode('ascii')\n", + " text = re.sub(r'[^a-zA-Z\\s]', ' ', text)\n", + " return re.sub(r'\\s+', ' ', text).strip()\n", + "\n", + "def remove_short_words(text, min_len=3):\n", + " \"\"\"Supprime les mots de moins de min_len caractères.\"\"\"\n", + " return \" \".join([word for word in text.split() if len(word) >= min_len])\n", + "\n", + "# Application\n", + "text = raw_text.lower() # minuscules\n", + "text = remove_punctuation(text) # ponctuation\n", + "text = remove_special_chars(text) # caractères spéciaux\n", + "text = remove_short_words(text) # mots trop courts\n", + "\n", + "print(\"🔇 Après Noise Removal :\")\n", + "print(text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## 📖 Étape 2 — Object Standardization\n", + "\n", + "On remplace les abréviations et l'argot par leurs formes standard.\n", + "\n", + "> 📖 
*Cours page 38 — lookup table `standardize`*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "SLANG_LOOKUP = {\n", + " \"n\": \"and\",\n", + " \"luv\": \"love\",\n", + " \"r\": \"are\",\n", + " \"u\": \"you\",\n", + " \"ur\": \"your\",\n", + " \"gonna\": \"going to\",\n", + " \"wanna\": \"want to\",\n", + " \"gotta\": \"got to\",\n", + " \"pokemons\": \"pokemon\",\n", + " \"pokmons\": \"pokemon\",\n", + "}\n", + "\n", + "def standardize(text, lookup=SLANG_LOOKUP):\n", + " \"\"\"Remplace les mots d'argot par leur forme standard.\"\"\"\n", + " words = text.split()\n", + " return \" \".join([lookup.get(word, word) for word in words])\n", + "\n", + "text = standardize(text)\n", + "\n", + "print(\"📖 Après Standardisation :\")\n", + "print(text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## ✂️ Étape 3 — Tokenization\n", + "\n", + "On découpe le texte en tokens individuels.\n", + "\n", + "> 📖 *Cours page 31 — `word_tokenize` (NLTK)*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from nltk import word_tokenize\n", + "\n", + "tokens = word_tokenize(text)\n", + "\n", + "print(f\"✂️ {len(tokens)} tokens :\")\n", + "print(tokens)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## 🚫 Étape 4 — Suppression des Stopwords\n", + "\n", + "On retire les mots grammaticaux qui n'apportent pas de sens (\"the\", \"is\", \"a\"...).\n", + "\n", + "> 📖 *Cours page 27 — `cleanTextGT` avec `stopwords` (NLTK)*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from nltk.corpus import stopwords\n", + "\n", + "stop_words = set(stopwords.words('english'))\n", + "\n", + "tokens = [token for token in tokens if token not in stop_words]\n", + "\n", + "print(\"🚫 Tokens après suppression des stopwords :\")\n", + 
"print(tokens)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## 🌿 Étape 5 — Lemmatization\n", + "\n", + "On réduit chaque mot à sa forme racine (`flames → flame`, `shooting → shoot`). On utilise le POS tag pour plus de précision.\n", + "\n", + "> 📖 *Cours page 36-37 — `WordNetLemmatizer` + POS tag*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from nltk.stem.wordnet import WordNetLemmatizer\n", + "from nltk import pos_tag\n", + "from nltk.corpus import wordnet\n", + "\n", + "lem = WordNetLemmatizer()\n", + "\n", + "def get_wordnet_pos(treebank_tag):\n", + " \"\"\"Convertit les tags Penn Treebank en tags WordNet.\"\"\"\n", + " if treebank_tag.startswith('J'): return wordnet.ADJ\n", + " elif treebank_tag.startswith('V'): return wordnet.VERB\n", + " elif treebank_tag.startswith('N'): return wordnet.NOUN\n", + " elif treebank_tag.startswith('R'): return wordnet.ADV\n", + " else: return wordnet.NOUN\n", + "\n", + "pos_tags = pos_tag(tokens)\n", + "tokens = [lem.lemmatize(token, get_wordnet_pos(tag)) for token, tag in pos_tags]\n", + "\n", + "print(\"🌿 Tokens après Lemmatization :\")\n", + "print(tokens)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## ✅ Résultat final — Texte nettoyé" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "clean_text = \" \".join(tokens)\n", + "\n", + "print(\"📄 Texte brut :\")\n", + "print(raw_text.strip())\n", + "print()\n", + "print(\"✅ Texte nettoyé :\")\n", + "print(clean_text)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/text-cleaner/.ipynb_checkpoints/text_cleaning_pipeline-checkpoint.py 
b/text-cleaner/.ipynb_checkpoints/text_cleaning_pipeline-checkpoint.py new file mode 100644 index 0000000..e27fb06 --- /dev/null +++ b/text-cleaner/.ipynb_checkpoints/text_cleaning_pipeline-checkpoint.py @@ -0,0 +1,158 @@ +"""Reusable text-cleaning pipeline for Pokemon descriptions. + +This module mirrors the notebook cleaning steps and exposes a Streamlit-friendly API: +- no input() calls +- no print side effects +- deterministic output for a given input +""" + +from __future__ import annotations + +import re +import string +from typing import Any, Dict, List + +SLANG_LOOKUP: Dict[str, str] = { + "n": "and", + "luv": "love", + "r": "are", + "u": "you", + "ur": "your", + "gonna": "going to", + "wanna": "want to", + "gotta": "got to", + "pokemons": "pokemon", + "pokmons": "pokemon", + "bcz": "because", +} + +_NLTK_RESOURCES = [ + "punkt", + "punkt_tab", + "stopwords", + "wordnet", + "averaged_perceptron_tagger", + "averaged_perceptron_tagger_eng", +] + + +def _import_nltk() -> Any: + """Import NLTK lazily so this module can be imported before deps are installed.""" + try: + import nltk # type: ignore + except ModuleNotFoundError as exc: + raise RuntimeError( + "NLTK is not installed. Install project dependencies with: pip install -r requirements.txt" + ) from exc + return nltk + + +def ensure_nltk_resources(quiet: bool = True) -> None: + """Download required NLTK resources if missing. + + Safe to call at app startup (including inside Streamlit). 
+ """ + nltk = _import_nltk() + for resource in _NLTK_RESOURCES: + try: + nltk.download(resource, quiet=quiet) + except Exception as exc: + raise RuntimeError(f"Failed to download NLTK resource: {resource}") from exc + + +def remove_punctuation(text: str) -> str: + mapping_table = text.maketrans("", "", string.punctuation) + return text.translate(mapping_table) + + +def remove_special_chars(text: str) -> str: + text = text.encode("ascii", "ignore").decode("ascii") + text = re.sub(r"[^a-zA-Z\s]", " ", text) + return re.sub(r"\s+", " ", text).strip() + + +def remove_short_words(text: str, min_len: int = 3) -> str: + return " ".join(word for word in text.split() if len(word) >= min_len) + + +def remove_alphanum_words(text: str) -> str: + words = text.split() + cleaned = [ + word + for word in words + if not (re.search(r"[a-zA-Z]", word) and re.search(r"[0-9]", word)) + ] + return " ".join(cleaned) + + +def standardize(text: str, lookup: Dict[str, str] | None = None) -> str: + mapping = lookup or SLANG_LOOKUP + return " ".join(mapping.get(word, word) for word in text.split()) + + +def _get_wordnet_pos(treebank_tag: str) -> str: + nltk = _import_nltk() + wordnet = nltk.corpus.wordnet + if treebank_tag.startswith("J"): + return wordnet.ADJ + if treebank_tag.startswith("V"): + return wordnet.VERB + if treebank_tag.startswith("N"): + return wordnet.NOUN + if treebank_tag.startswith("R"): + return wordnet.ADV + return wordnet.NOUN + + +def clean_pokemon_text(raw_text: str, min_len: int = 3) -> Dict[str, Any]: + """Run the full cleaning pipeline and return intermediate + final outputs. + + Returns a dictionary so a UI can display each stage if desired. 
+ """ + if not isinstance(raw_text, str): + raise TypeError("raw_text must be a string") + + nltk = _import_nltk() + pos_tag = nltk.pos_tag + word_tokenize = nltk.word_tokenize + stopwords = nltk.corpus.stopwords + WordNetLemmatizer = nltk.stem.wordnet.WordNetLemmatizer + + ensure_nltk_resources(quiet=True) + + text = raw_text.lower() + text = remove_punctuation(text) + text = remove_alphanum_words(text) + text = remove_special_chars(text) + noise_removed = remove_short_words(text, min_len=min_len) + + standardized = standardize(noise_removed) + + tokens = word_tokenize(standardized) + + stop_words = set(stopwords.words("english")) + tokens_no_stopwords = [token for token in tokens if token not in stop_words] + + lem = WordNetLemmatizer() + pos_tags = pos_tag(tokens_no_stopwords) + lemmas = [ + lem.lemmatize(token, _get_wordnet_pos(tag)) + for token, tag in pos_tags + ] + + clean_text = " ".join(lemmas) + + return { + "raw_text": raw_text, + "noise_removed": noise_removed, + "standardized": standardized, + "tokens": tokens, + "tokens_no_stopwords": tokens_no_stopwords, + "lemmas": lemmas, + "clean_text": clean_text, + } + + +def get_clean_text(raw_text: str, min_len: int = 3) -> str: + """Small helper for app code that only needs the final cleaned text.""" + return clean_pokemon_text(raw_text, min_len=min_len)["clean_text"] diff --git a/text-cleaner/__pycache__/text_cleaning_pipeline.cpython-312.pyc b/text-cleaner/__pycache__/text_cleaning_pipeline.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..319ffcabf50a74be555b6ecee58efe45b307160e GIT binary patch literal 7134 zcma)BYiu0Xb)Gvj`ODS+#6YvM7mCMai~kG27A3kQ{P% zW^-p&6gRuc)-f6}F+d5ZLnd`lrY+DIasd}LP!%aqHOasJA-!_Bod~Fns6g|N5|!3c zf3)Y^*;zhJH@VW@Idkv1kGbbN=bm%;FLiZc4$A%i{i*)rMvnUxzIY|#5@h9PK+JL% zIF%dXRHE`}lHrGVg0zqph6MH$heUXaX)0WFUXLjwyUvs#G#1Vq3%%Q@A!u5R9S6?mU?)$z_UT! 
zsqRu+p(hG0ZP3!FcB;G8Ezr`WcBxyTB?dg*>NaSJ1NDTu9jN9hA+aa_ALq23rCd&H zvaMaUds1mlF?3^8&gxk$ts9y=Vp936IiY1tLsm5_MfI$$n}*dJ4xb;>EjeSVIT)SM zDK)7j+hdw+n6`G=G$-WRY?iHMEm<*CS-YAwEe&XS(59M_N$Ymc2-P)1O;5>3&YpN7 z+ymof-N@$bJqbCbq|=t$l%=|1%a*QcvNkfJrEF|hHCv+@-Ow#tPswJ^hGA^Wiae@c z(F}K@@UKyB3F1hK0h?3U2Mit|kIpR@0Wjo?2n8OiZkB*o&uV_G0sHV)EBSJ;< z!BGh@?}ld8Kb5G4VctG^av>+bi=X% zCYstyaUTQui)=EO0A#(US3cn~%2{fT1AcG@z@28KbO7X}ZjZfyu=hsDJyfRJ*K<16 zGMZsqz4lcbAR`)Sd%}sGX0RVH?PF%nQ2P-4PUKw9u=R|_-V!|B3A=Fw?P|)xeaZ6U zm-mgC8Ev0xKHH;=r?dOUb2PrazzM$C-TJMS$moi|H-tZHXj=C1q1gW{3()b{ zio(#h#UOVrv`e@VX%~Lv+a;u`GJ`IT^TUtH{`)}8a(1onCiX%BdDrBBlpAh=7Cyuk z$aoC}E+5tfzQBD*ZdUmT_imfWO$mS?T!j@tP`oJEA@5z~3;dsPKjLdR5&%a+QTV2? z4oBIe=A@A}71fiO>OzWY7AS8D1gwvMk+Lk%VK7}XltE<#te(kijI7wQlFfpyDb&to z<(5XT4L(CqxKV~P&w!>Lqn6AXdg+Aqih2_F-d(~!iT zen?{Mkp-FW8%PV`+Q*MiHu5V}Yzk?73n>CaV+ulbB(R$)JI8JUC&c>FimfGhM?_6@>N0=^ zU_{HB)~fpJ9(NPi1GpcOYi6Lo4_o*H_*p2j5?6`0%?{icm_IO2KYVU!?@Nn&Un<8B zmrhrthMB|Dhv$0A(w+rz&kAmQ^F#Va==pyRiT5;Uym}lslULN*5fr&qOFUlFFprPp zR6!LBTGw zUbtlKP52x?*k4oC9KWiuCy%$TmAmZtP*m9jq0hns7J{XU_jpwRR#r>tN;;VuQ>aB> z0fwW%H@x^LAN!YPF?% zu5%7rb}va!E=o^A*Mj&IJqYuqY6cEoYDhj0FW(`va2alf&f z&ZE}gRv4h@0cZ1_1-}f$d*mcX7(j-bnE|fp(klL^`3cZ`i@e)ZAm1mm1dMGf z7+dadz;J#*2EZE<(wYG`8sEF$5z%9C0^ms{q0zz(%W{bU{&DEgsAhoQMi1w^yrUn3 zJ3Gmojow3P6TCF*aBqzbC17ob>Wly6z9?~@#QNvXe>`|A{&yE|KT&@6#Gf8rjP*aV zg23?YSO=L6bP&|Lad59d=vnXx;qFAC(-Sxm9=-fW>vhjsU7wzYesoK$4ya(cic~*y zaQa|5)<1v#uZMm-^p}_J#QMup|AN@>%CdG=z>gLLK9;Qf0*J3tI8Q=gbI4I{hEMUM zfQaG!c+uH4*=}a7R8pv^DsA~Kyl~NZYHb01gMasc$Q6YmTz8vVx|(Y>2)iNhEAo?I z?gsL~w_HUV&imfl-}C%10~6dQr{EhYnkg`YnnF`!6eR^6c-Y{VqVGswLps2m4^7>8 z(^SQ;q_bm+k;}LUpyy!70Q{_Bs6YU4pY%`nfA3VOZ`sc^Z~ZTy*x>&x61^L3nho6u z&F!7H??j*axase={Oy)o2XCLh^Lzadd|aq*SpdG$@nsGk=)yFh7_T2MOWPO3?T;Qv z&}^aZz4z>qc5!o!9E0xf@E}KWl~{hl+3@a`@A9CfHCcJB9CwYcX(`UVp?H%2owXF_ zU|NdiXpNSlQ$(iu&Fn-0Q02+Js;B4_F+HVyFb!(zs0O|#Jq~YG0afp(b(%_14e}k} z!x*YURXwk5qMSp(vraktJ`%4VpFMfw8{HO3ggnt^j z6YDEWeG6jWm(>&(fFD_T1(bJl9~At}wMH5SVA)J5i{OK9rckT*CXE#MakNhx${@Ne 
zn>Zb6%{yfyS^i-C(61a>Q(`upD=E49rybd=<1VVW2vSLhN zOvyY|Hnxg~>UAAW0(hCFfQU2+(LdE-el!)<+fbFbyHYFqrIq@onThF%rTU$V^*aF{ zf!I=@V=>S%XWa>ORvKDnzA^och2YjNw-=@h3&Cx_dwX>rscGiW^q~c@^~*Or3Xs9= zX8@_Sa4W0{bv>Q6q*W(t6+&+ly9N}`iU9C7g;i02_Dw)j6gL;(_$IxeaB! zX+s0Nj-$iAp%ndgtJkFRAJ$+=5b|gOHk8^o1?+8JS0Ug*vKX)tM69f9-LYxw&ehfe z`B3yuwivWKYdyi*+##MTh=ow?J#2T?xP0SJc=YO1^?AJBvnUn9aFa)>H#xYn#YiDi zkggE$mx6@|+`SUqy>-P1tmUtLYdyRPg}T}-4TT1rWhR;um`5OU+$BIvF;a{c>c=q_ z*iZ@)d#~42hj%Fyy_qC@ip3m>GWz|&Py$@)Ew399L0WdE05@glCEaf zFzJVJ=Q+lSNjw#@R1mQ$lx1xc`N9bZry)_OgCD0%$(oWH!c-Fc zEmWCIVz4D=a+Zx59tBff5OkfVcw2lxnE&SlAQ_d`tufb&2{=J-oKt6*x}`zLi}9H1 zMAlq9s^hmICkGK<%zY*eGr5`|^QW~;MzNeQ^K;lP60J^HhwK1OlEM(grcUVmRMySr zxCRpPd={EIrw)CkB%2v-S^72DE=7me5h2r=aSdqW8^AN5*{5(R8`)S$?)cK?B+wW# zGS7)cF3gK%wcHr8Dt;Plp8q8gvyFRg(BZjs_wXZeAN^tJ4X5~7=Vjf z!*Er*X@{2UV}B*wk#^s$Z(V46x?KOvLhzY;t42RPod$CsZ(E9YFUGqeN{+^7r5n;x zv}-ZiRXR~=+y3XFAB2|L5{qq#rM4Fr+g@A{o9~HR=EYCNuFs?xhSX(g8-{W1b81=Y zL`t5Im!)o`cFc=qsSBuh_q_el8+YO_l}_J_b<7pZv7XYY&+E3%9V*xDDIL4l7{7jT zHhCjCf3e)WuiUu5)L-d+=3lp-oxgC)cYElQgJ*vD?84TwaA83Gd3akn++KS1-x^vL zTAwU8>@B?pzV5cp|Mm&>{?8*V*ZQtMy)1B{$V}aI-F59wd(Tf&<@N)g2B8%=O2_VV z@BlBjsd={TM%!F>xp8NyAKm5l%9id*+wMwJTV=-qtXt&!4Z(-j~=vg+y0O| zS_#5v3r+ja_l5de#GgMeLj8+aSKl`OI?q@H7Iu^E0%HA529~zwcd9&^cnI5t{E@*7 zBva)vEe$DSHB?_Jo;-YPi7uhR{n#VJ?Oq2;LD65YE*w zyDpVdkDkG*3erCY=J(;pGD;=x9>{a-qre^M;H~Y8(y;~c7`v(=SoFKLisI)b$oXon z%n{+Q4rz*B1^6kZp;bd0`gLew&5TNrU&E0=io#VB)`{xm2w?hF*<9ayEz=voly6?2 zn3oGOuCzJ~)>Lu14Qu+{Bd9G#{{)$Et7}vJ@aKqO*cUskO<3LSw>W`hQZ-Y_q!Ugi zM{;1bwPcdwH?ioBV%87r-xMaW*bz}5_F)2udGnBkOs@Gu2#kFaySlOJ#fsUcSCGQs z+O<;X#WT;1iIj=z1CGgq*@DNxL+)RK4%2sm6Lg=mz6%xP2nl(>`-%9QC`USf#f5&! 
z)%`oSYmwUpXb(43q{d399{-1`pk*sXxNn#}|ke-JTLQayEV1TqgjPv9eSqV#|?cw%b;{VNM!jt2K H?e+fvx4@Az literal 0 HcmV?d00001 diff --git a/text-cleaner/pokemon_text_cleaning.ipynb b/text-cleaner/pokemon_text_cleaning.ipynb new file mode 100644 index 0000000..b92d54d --- /dev/null +++ b/text-cleaner/pokemon_text_cleaning.ipynb @@ -0,0 +1,451 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Partie 1 — Nettoyage du Texte\n", + "source .venv/bin/activate\n", + "cd \n", + "python nom du fichier\n", + "On prend un texte descriptif fourni par l'utilisateur et on le nettoie étape par étape.\n", + "\n", + "```\n", + "Texte brut → Noise Removal → Tokenization → Stopwords → Lemmatization → Texte propre\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Installation des dépendances" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dépendances installées !\n" + ] + } + ], + "source": [ + "!pip install nltk --quiet\n", + "\n", + "import nltk\n", + "nltk.download('punkt', quiet=True)\n", + "nltk.download('punkt_tab', quiet=True)\n", + "nltk.download('stopwords', quiet=True)\n", + "nltk.download('wordnet', quiet=True)\n", + "nltk.download('averaged_perceptron_tagger', quiet=True)\n", + "nltk.download('averaged_perceptron_tagger_eng', quiet=True)\n", + "\n", + "print(\"Dépendances installées !\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Saisie du texte utilisateur" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [ + { + "ename": "KeyboardInterrupt", + "evalue": "Interrupted by user", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mKeyboardInterrupt\u001b[39m Traceback (most recent call last)", + 
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[86]\u001b[39m\u001b[32m, line 65\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m# test_texts = [\u001b[39;00m\n\u001b[32m 2\u001b[39m \n\u001b[32m 3\u001b[39m \u001b[38;5;66;03m# # 0 — Dragon de feu (texte original)\u001b[39;00m\n\u001b[32m (...)\u001b[39m\u001b[32m 62\u001b[39m \n\u001b[32m 63\u001b[39m \u001b[38;5;66;03m# print(f\" Texte de test n°{INDEX} :\")\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m65\u001b[39m raw_text = \u001b[38;5;28;43minput\u001b[39;49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mDécrivez votre Pokémon : \u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 67\u001b[39m \u001b[38;5;28mprint\u001b[39m(raw_text)\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/lib/python3.12/site-packages/ipykernel/kernelbase.py:1403\u001b[39m, in \u001b[36mKernel.raw_input\u001b[39m\u001b[34m(self, prompt)\u001b[39m\n\u001b[32m 1401\u001b[39m msg = \u001b[33m\"\u001b[39m\u001b[33mraw_input was called, but this frontend does not support input requests.\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 1402\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m StdinNotImplementedError(msg)\n\u001b[32m-> \u001b[39m\u001b[32m1403\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_input_request\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 1404\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mstr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mprompt\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1405\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_get_shell_context_var\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_shell_parent_ident\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1406\u001b[39m \u001b[43m 
\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mget_parent\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mshell\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1407\u001b[39m \u001b[43m \u001b[49m\u001b[43mpassword\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 1408\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/lib/python3.12/site-packages/ipykernel/kernelbase.py:1448\u001b[39m, in \u001b[36mKernel._input_request\u001b[39m\u001b[34m(self, prompt, ident, parent, password)\u001b[39m\n\u001b[32m 1445\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyboardInterrupt\u001b[39;00m:\n\u001b[32m 1446\u001b[39m \u001b[38;5;66;03m# re-raise KeyboardInterrupt, to truncate traceback\u001b[39;00m\n\u001b[32m 1447\u001b[39m msg = \u001b[33m\"\u001b[39m\u001b[33mInterrupted by user\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m-> \u001b[39m\u001b[32m1448\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyboardInterrupt\u001b[39;00m(msg) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 1449\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m:\n\u001b[32m 1450\u001b[39m \u001b[38;5;28mself\u001b[39m.log.warning(\u001b[33m\"\u001b[39m\u001b[33mInvalid Message:\u001b[39m\u001b[33m\"\u001b[39m, exc_info=\u001b[38;5;28;01mTrue\u001b[39;00m)\n", + "\u001b[31mKeyboardInterrupt\u001b[39m: Interrupted by user" + ] + } + ], + "source": [ + "# test_texts = [\n", + "\n", + "# # 0 — Dragon de feu (texte original)\n", + "# \"\"\"\n", + "# This is a HUGE fire dragon!!! It has got massive red wings and shoots\n", + "# powerfull flames from its mouth... 
It's super fast n really strong!!\n", + "# Its body is coverd with shiny golden scales & it lives in volcanos.\n", + "# it luv to fight other pokémons and is very very aggressive >:(\n", + "# I want to call it Pyrokar.\n", + "# \"\"\",\n", + "\n", + "# # 1 — Pokémon aquatique calme\n", + "# \"\"\"\n", + "# My pokemon is called Aqualis!! its a small blue sea creature w/ big\n", + "# shiny eyes... very calm n gentle :) it swims super fast in deep oceans\n", + "# and can breath underwater 4ever. it glows in the dark like a lanternfish\n", + "# and heals other pokemons with its tears!!! luv this lil guy so much omg\n", + "# \"\"\",\n", + "\n", + "# # 2 — Pokémon électrique agressif\n", + "# \"\"\"\n", + "# ZAPTHORN is da name!! its an electric wolf w/ yellow n black fur and\n", + "# giant thunder claws !!! it runz at lightning speed thru storms & shoots\n", + "# bolts from its tail... super scary n powerfull enemy 4 sure >:D\n", + "# nobody can catch it bcz it disappears in the clouds when threatened\n", + "# \"\"\",\n", + "\n", + "# # 3 — Pokémon plante timide\n", + "# \"\"\"\n", + "# i wanna name it Sylverion... its a shy deer-like pokemon covered in\n", + "# beautiful flowers n vines. it lives deep in enchanted forests & only\n", + "# comes out at nite. its antlers r made of ancient wood n bloom every\n", + "# spring!! it can make plants grow super fast around it... so magical omg\n", + "# \"\"\",\n", + "\n", + "# # 4 — Pokémon glace / fantôme\n", + "# \"\"\"\n", + "# This haunted ice spirit is called Glacyra!!! it floats thru frozen\n", + "# mountains leavin icy footprints everywhere... its body is trasnparent\n", + "# like glass n u can see its frozen heart inside >< it whispers 2 trainers\n", + "# in their sleep n freezes everything it touchez. very misunderstood tbh\n", + "# \"\"\",\n", + "\n", + "# # 5 — Pokémon combat en franglais\n", + "# \"\"\"\n", + "# My Pokémon is called Ferroknux!! 
It's a big metal gorilla with\n", + "# gigantic iron fists and super thick armor on its chest... it smashes\n", + "# rocks with bare hands and trains all day, every day in the mountains!!\n", + "# Very strong and very aggressive, but loyal to its trainer 4ever :)\n", + "# \"\"\",\n", + "# #6 \n", + "# \"\"\"\n", + "# Furret is a long, slender, and agile creature with soft fur and a flexible body that allows it to move gracefully through narrow tunnels and hidden pathways. This normal-type Pokémon builds intricate nests perfectly shaped to fit its elongated form, making them nearly impossible for other creatures to enter. Despite its gentle and calm nature, Furret can become surprisingly energetic in battle, using its powerful tail to smash opponents with swift and playful attacks. It is often seen wandering across fields and forests, curiously observing its surroundings, and it shares a close bond with its pre-evolution, Sentret. Known for its endurance and cheerful spirit, Furret can quickly recover its energy, always feeling fine and ready to continue exploring or fighting alongside its trainer.\n", + "# \"\"\",\n", + "\n", + "# ]\n", + "\n", + "# # 👇 Changez cet index pour tester un autre texte\n", + "# INDEX = 6\n", + "\n", + "# raw_text = test_texts[INDEX]\n", + "\n", + "# print(f\" Texte de test n°{INDEX} :\")\n", + "\n", + "raw_text = input(\"Décrivez votre Pokémon : \")\n", + "\n", + "print(raw_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Étape 1 — Noise Removal\n", + "\n", + "On supprime la ponctuation, les caractères spéciaux, les mots trop courts, et on met tout en minuscules.\n", + "\n", + "> *Cours page 25-29 — `removePunctuation`, `removeShortWords`, `removePattern`*" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Après Noise Removal :\n", + "furret long slender and agile creature with 
soft fur and flexible body that allows move gracefully through narrow tunnels and hidden pathways this normaltype pokmon builds intricate nests perfectly shaped fit its elongated form making them nearly impossible for other creatures enter despite its gentle and calm nature furret can become surprisingly energetic battle using its powerful tail smash opponents with swift and playful attacks often seen wandering across fields and forests curiously observing its surroundings and shares close bond with its preevolution sentret known for its endurance and cheerful spirit furret can quickly recover its energy always feeling fine and ready continue exploring fighting alongside its trainer\n" + ] + } + ], + "source": [ + "import re\n", + "import string\n", + "\n", + "def remove_punctuation(text):\n", + " \"\"\"Supprime la ponctuation du texte.\"\"\"\n", + " mapping_table = text.maketrans('', '', string.punctuation)\n", + " return text.translate(mapping_table)\n", + "\n", + "def remove_special_chars(text):\n", + " \"\"\"Supprime les caractères non-ASCII (emojis, accents parasites...).\"\"\"\n", + " text = text.encode('ascii', 'ignore').decode('ascii')\n", + " text = re.sub(r'[^a-zA-Z\\s]', ' ', text)\n", + " return re.sub(r'\\s+', ' ', text).strip()\n", + "\n", + "def remove_short_words(text, min_len=3):\n", + " \"\"\"Supprime les mots de moins de min_len caractères.\"\"\"\n", + " return \" \".join([word for word in text.split() if len(word) >= min_len])\n", + "\n", + "\n", + "def remove_alphanum_words(text):\n", + " \"\"\"Supprime les mots qui contiennent à la fois des lettres et des chiffres\n", + " (ex: '4ever', 'n1', '2night', 'runz4', 'mp3').\"\"\"\n", + " words = text.split()\n", + " cleaned = [word for word in words\n", + " if not (re.search(r'[a-zA-Z]', word) and re.search(r'[0-9]', word))]\n", + " return \" \".join(cleaned)\n", + "\n", + "# Application\n", + "text = raw_text.lower() # minuscules\n", + "text = remove_punctuation(text) # ponctuation\n", + "text = 
remove_alphanum_words(text) \n", + "text = remove_special_chars(text) # caractères spéciaux\n", + "text = remove_short_words(text) # mots trop courts\n", + "\n", + "print(\" Après Noise Removal :\")\n", + "print(text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Étape 2 — Object Standardization\n", + "\n", + "On remplace les abréviations et l'argot par leurs formes standard.\n", + "\n", + "> *Cours page 38 — lookup table `standardize`*" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Après Standardisation :\n", + "furret long slender and agile creature with soft fur and flexible body that allows move gracefully through narrow tunnels and hidden pathways this normaltype pokmon builds intricate nests perfectly shaped fit its elongated form making them nearly impossible for other creatures enter despite its gentle and calm nature furret can become surprisingly energetic battle using its powerful tail smash opponents with swift and playful attacks often seen wandering across fields and forests curiously observing its surroundings and shares close bond with its preevolution sentret known for its endurance and cheerful spirit furret can quickly recover its energy always feeling fine and ready continue exploring fighting alongside its trainer\n" + ] + } + ], + "source": [ + "SLANG_LOOKUP = {\n", + " \"n\": \"and\",\n", + " \"luv\": \"love\",\n", + " \"r\": \"are\",\n", + " \"u\": \"you\",\n", + " \"ur\": \"your\",\n", + " \"gonna\": \"going to\",\n", + " \"wanna\": \"want to\",\n", + " \"gotta\": \"got to\",\n", + " \"pokemons\": \"pokemon\",\n", + " \"pokmons\": \"pokemon\",\n", + " \"bcz\": \"because\",\n", + "}\n", + "\n", + "def standardize(text, lookup=SLANG_LOOKUP):\n", + " \"\"\"Remplace les mots d'argot par leur forme standard.\"\"\"\n", + " words = text.split()\n", + " return \" \".join([lookup.get(word, 
word) for word in words])\n", + "\n", + "text = standardize(text)\n", + "\n", + "print(\" Après Standardisation :\")\n", + "print(text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Étape 3 — Tokenization\n", + "\n", + "On découpe le texte en tokens individuels.\n", + "\n", + "> *Cours page 31 — `word_tokenize` (NLTK)*" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 108 tokens :\n", + "['furret', 'long', 'slender', 'and', 'agile', 'creature', 'with', 'soft', 'fur', 'and', 'flexible', 'body', 'that', 'allows', 'move', 'gracefully', 'through', 'narrow', 'tunnels', 'and', 'hidden', 'pathways', 'this', 'normaltype', 'pokmon', 'builds', 'intricate', 'nests', 'perfectly', 'shaped', 'fit', 'its', 'elongated', 'form', 'making', 'them', 'nearly', 'impossible', 'for', 'other', 'creatures', 'enter', 'despite', 'its', 'gentle', 'and', 'calm', 'nature', 'furret', 'can', 'become', 'surprisingly', 'energetic', 'battle', 'using', 'its', 'powerful', 'tail', 'smash', 'opponents', 'with', 'swift', 'and', 'playful', 'attacks', 'often', 'seen', 'wandering', 'across', 'fields', 'and', 'forests', 'curiously', 'observing', 'its', 'surroundings', 'and', 'shares', 'close', 'bond', 'with', 'its', 'preevolution', 'sentret', 'known', 'for', 'its', 'endurance', 'and', 'cheerful', 'spirit', 'furret', 'can', 'quickly', 'recover', 'its', 'energy', 'always', 'feeling', 'fine', 'and', 'ready', 'continue', 'exploring', 'fighting', 'alongside', 'its', 'trainer']\n" + ] + } + ], + "source": [ + "from nltk import word_tokenize\n", + "\n", + "tokens = word_tokenize(text)\n", + "\n", + "print(f\" {len(tokens)} tokens :\")\n", + "print(tokens)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Étape 4 — Suppression des Stopwords\n", + "\n", + "On retire les mots grammaticaux qui n'apportent pas de sens (\"the\", 
\"is\", \"a\"...).\n", + "\n", + "> *Cours page 27 — `cleanTextGT` avec `stopwords` (NLTK)*" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tokens après suppression des stopwords :\n", + "['furret', 'long', 'slender', 'agile', 'creature', 'soft', 'fur', 'flexible', 'body', 'allows', 'move', 'gracefully', 'narrow', 'tunnels', 'hidden', 'pathways', 'normaltype', 'pokmon', 'builds', 'intricate', 'nests', 'perfectly', 'shaped', 'fit', 'elongated', 'form', 'making', 'nearly', 'impossible', 'creatures', 'enter', 'despite', 'gentle', 'calm', 'nature', 'furret', 'become', 'surprisingly', 'energetic', 'battle', 'using', 'powerful', 'tail', 'smash', 'opponents', 'swift', 'playful', 'attacks', 'often', 'seen', 'wandering', 'across', 'fields', 'forests', 'curiously', 'observing', 'surroundings', 'shares', 'close', 'bond', 'preevolution', 'sentret', 'known', 'endurance', 'cheerful', 'spirit', 'furret', 'quickly', 'recover', 'energy', 'always', 'feeling', 'fine', 'ready', 'continue', 'exploring', 'fighting', 'alongside', 'trainer']\n" + ] + } + ], + "source": [ + "from nltk.corpus import stopwords\n", + "\n", + "stop_words = set(stopwords.words('english'))\n", + "\n", + "tokens = [token for token in tokens if token not in stop_words]\n", + "\n", + "print(\"Tokens après suppression des stopwords :\")\n", + "print(tokens)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Étape 5 — Lemmatization\n", + "\n", + "On réduit chaque mot à sa forme racine (`flames → flame`, `shooting → shoot`). 
On utilise le POS tag pour plus de précision.\n", + "\n", + "> *Cours page 36-37 — `WordNetLemmatizer` + POS tag*" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Tokens après Lemmatization :\n", + "['furret', 'long', 'slender', 'agile', 'creature', 'soft', 'fur', 'flexible', 'body', 'allow', 'move', 'gracefully', 'narrow', 'tunnel', 'hide', 'pathway', 'normaltype', 'pokmon', 'build', 'intricate', 'nest', 'perfectly', 'shape', 'fit', 'elongated', 'form', 'make', 'nearly', 'impossible', 'creature', 'enter', 'despite', 'gentle', 'calm', 'nature', 'furret', 'become', 'surprisingly', 'energetic', 'battle', 'use', 'powerful', 'tail', 'smash', 'opponent', 'swift', 'playful', 'attack', 'often', 'see', 'wander', 'across', 'field', 'forest', 'curiously', 'observe', 'surroundings', 'share', 'close', 'bond', 'preevolution', 'sentret', 'know', 'endurance', 'cheerful', 'spirit', 'furret', 'quickly', 'recover', 'energy', 'always', 'feel', 'fine', 'ready', 'continue', 'explore', 'fight', 'alongside', 'trainer']\n" + ] + } + ], + "source": [ + "from nltk.stem.wordnet import WordNetLemmatizer\n", + "from nltk import pos_tag\n", + "from nltk.corpus import wordnet\n", + "\n", + "lem = WordNetLemmatizer()\n", + "\n", + "def get_wordnet_pos(treebank_tag):\n", + " \"\"\"Convertit les tags Penn Treebank en tags WordNet.\"\"\"\n", + " if treebank_tag.startswith('J'): return wordnet.ADJ\n", + " elif treebank_tag.startswith('V'): return wordnet.VERB\n", + " elif treebank_tag.startswith('N'): return wordnet.NOUN\n", + " elif treebank_tag.startswith('R'): return wordnet.ADV\n", + " else: return wordnet.NOUN\n", + "\n", + "pos_tags = pos_tag(tokens)\n", + "tokens = [lem.lemmatize(token, get_wordnet_pos(tag)) for token, tag in pos_tags]\n", + "\n", + "print(\" Tokens après Lemmatization :\")\n", + "print(tokens)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ 
+ "---\n", + "## Résultat final — Texte nettoyé" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "📄 Texte brut :\n", + "Furret is a long, slender, and agile creature with soft fur and a flexible body that allows it to move gracefully through narrow tunnels and hidden pathways. This normal-type Pokémon builds intricate nests perfectly shaped to fit its elongated form, making them nearly impossible for other creatures to enter. Despite its gentle and calm nature, Furret can become surprisingly energetic in battle, using its powerful tail to smash opponents with swift and playful attacks. It is often seen wandering across fields and forests, curiously observing its surroundings, and it shares a close bond with its pre-evolution, Sentret. Known for its endurance and cheerful spirit, Furret can quickly recover its energy, always feeling fine and ready to continue exploring or fighting alongside its trainer.\n", + "\n", + "Texte nettoyé :\n", + "furret long slender agile creature soft fur flexible body allow move gracefully narrow tunnel hide pathway normaltype pokmon build intricate nest perfectly shape fit elongated form make nearly impossible creature enter despite gentle calm nature furret become surprisingly energetic battle use powerful tail smash opponent swift playful attack often see wander across field forest curiously observe surroundings share close bond preevolution sentret know endurance cheerful spirit furret quickly recover energy always feel fine ready continue explore fight alongside trainer\n" + ] + } + ], + "source": [ + "clean_text = \" \".join(tokens)\n", + "\n", + "print(\"📄 Texte brut :\")\n", + "print(raw_text.strip())\n", + "print()\n", + "print(\"Texte nettoyé :\")\n", + "print(clean_text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + 
"kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/text-cleaner/text_cleaning_pipeline.py b/text-cleaner/text_cleaning_pipeline.py new file mode 100644 index 0000000..e27fb06 --- /dev/null +++ b/text-cleaner/text_cleaning_pipeline.py @@ -0,0 +1,158 @@ +"""Reusable text-cleaning pipeline for Pokemon descriptions. + +This module mirrors the notebook cleaning steps and exposes a Streamlit-friendly API: +- no input() calls +- no print side effects +- deterministic output for a given input +""" + +from __future__ import annotations + +import re +import string +from typing import Any, Dict, List + +SLANG_LOOKUP: Dict[str, str] = { + "n": "and", + "luv": "love", + "r": "are", + "u": "you", + "ur": "your", + "gonna": "going to", + "wanna": "want to", + "gotta": "got to", + "pokemons": "pokemon", + "pokmons": "pokemon", + "bcz": "because", +} + +_NLTK_RESOURCES = [ + "punkt", + "punkt_tab", + "stopwords", + "wordnet", + "averaged_perceptron_tagger", + "averaged_perceptron_tagger_eng", +] + + +def _import_nltk() -> Any: + """Import NLTK lazily so this module can be imported before deps are installed.""" + try: + import nltk # type: ignore + except ModuleNotFoundError as exc: + raise RuntimeError( + "NLTK is not installed. Install project dependencies with: pip install -r requirements.txt" + ) from exc + return nltk + + +def ensure_nltk_resources(quiet: bool = True) -> None: + """Download required NLTK resources if missing. + + Safe to call at app startup (including inside Streamlit). 
+ """ + nltk = _import_nltk() + for resource in _NLTK_RESOURCES: + try: + nltk.download(resource, quiet=quiet) + except Exception as exc: + raise RuntimeError(f"Failed to download NLTK resource: {resource}") from exc + + +def remove_punctuation(text: str) -> str: + mapping_table = text.maketrans("", "", string.punctuation) + return text.translate(mapping_table) + + +def remove_special_chars(text: str) -> str: + text = text.encode("ascii", "ignore").decode("ascii") + text = re.sub(r"[^a-zA-Z\s]", " ", text) + return re.sub(r"\s+", " ", text).strip() + + +def remove_short_words(text: str, min_len: int = 3) -> str: + return " ".join(word for word in text.split() if len(word) >= min_len) + + +def remove_alphanum_words(text: str) -> str: + words = text.split() + cleaned = [ + word + for word in words + if not (re.search(r"[a-zA-Z]", word) and re.search(r"[0-9]", word)) + ] + return " ".join(cleaned) + + +def standardize(text: str, lookup: Dict[str, str] | None = None) -> str: + mapping = lookup or SLANG_LOOKUP + return " ".join(mapping.get(word, word) for word in text.split()) + + +def _get_wordnet_pos(treebank_tag: str) -> str: + nltk = _import_nltk() + wordnet = nltk.corpus.wordnet + if treebank_tag.startswith("J"): + return wordnet.ADJ + if treebank_tag.startswith("V"): + return wordnet.VERB + if treebank_tag.startswith("N"): + return wordnet.NOUN + if treebank_tag.startswith("R"): + return wordnet.ADV + return wordnet.NOUN + + +def clean_pokemon_text(raw_text: str, min_len: int = 3) -> Dict[str, Any]: + """Run the full cleaning pipeline and return intermediate + final outputs. + + Returns a dictionary so a UI can display each stage if desired. 
+ """ + if not isinstance(raw_text, str): + raise TypeError("raw_text must be a string") + + nltk = _import_nltk() + pos_tag = nltk.pos_tag + word_tokenize = nltk.word_tokenize + stopwords = nltk.corpus.stopwords + WordNetLemmatizer = nltk.stem.wordnet.WordNetLemmatizer + + ensure_nltk_resources(quiet=True) + + text = raw_text.lower() + text = remove_punctuation(text) + text = remove_alphanum_words(text) + text = remove_special_chars(text) + noise_removed = remove_short_words(text, min_len=min_len) + + standardized = standardize(noise_removed) + + tokens = word_tokenize(standardized) + + stop_words = set(stopwords.words("english")) + tokens_no_stopwords = [token for token in tokens if token not in stop_words] + + lem = WordNetLemmatizer() + pos_tags = pos_tag(tokens_no_stopwords) + lemmas = [ + lem.lemmatize(token, _get_wordnet_pos(tag)) + for token, tag in pos_tags + ] + + clean_text = " ".join(lemmas) + + return { + "raw_text": raw_text, + "noise_removed": noise_removed, + "standardized": standardized, + "tokens": tokens, + "tokens_no_stopwords": tokens_no_stopwords, + "lemmas": lemmas, + "clean_text": clean_text, + } + + +def get_clean_text(raw_text: str, min_len: int = 3) -> str: + """Small helper for app code that only needs the final cleaned text.""" + return clean_pokemon_text(raw_text, min_len=min_len)["clean_text"]