first commit
189
clean-text-to-keywords/.ipynb_checkpoints/README-checkpoint.md
Normal file
@@ -0,0 +1,189 @@
# Pokemon Text-to-JSON Pipeline

This project converts free-form Pokemon description text into:

1. A normalized keyword list
2. A populated Pokemon JSON object (from a blank/key-only template)

The pipeline is deterministic and rule-based.
## Architecture

### Stage 1: Keyword Extraction

File: `keyword_extractor.py`

Input: raw text description

Core logic:

- spaCy tokenization and POS tagging
- POS filtering (`NOUN`, `ADJ`, `VERB`)
- stopword and punctuation removal
- lemma-based normalization
- domain synonym normalization (example: `flames -> fire`)
- optional YAKE relevance scoring
- conservative retention policy so detail is not over-pruned

Output: ordered list of normalized keywords
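The synonym normalization and order-preserving deduplication steps above can be sketched as follows. This is a minimal sketch using a toy two-entry map, not the full `DEFAULT_NORMALIZATION_MAP` from `keyword_extractor.py`:

```python
# Map synonyms to canonical concepts via an inverted lookup,
# then deduplicate while preserving first-seen order.
SYNONYMS = {"fire": ["flame", "flames", "blaze"], "water": ["aqua", "wave"]}
LOOKUP = {syn: canon for canon, syns in SYNONYMS.items() for syn in syns}

def normalize(keywords):
    normalized = [LOOKUP.get(k, k) for k in keywords]
    seen, out = set(), []
    for k in normalized:      # a second dedup pass is needed because
        if k not in seen:     # several words can map to one concept
            seen.add(k)
            out.append(k)
    return out

print(normalize(["flames", "dragon", "blaze", "dragon"]))  # ['fire', 'dragon']
```

The real pipeline runs this after POS filtering and lemmatization, so the lookup only ever sees lowercase lemmas.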
### Stage 2: JSON Inference

File: `json_inference.py`

Input: keyword list + optional JSON template

Core logic:

- infer primary/secondary type
- infer name candidate
- infer attacks, abilities, habitat, personality
- infer basic stats (`hp`, `attack`, `defense`, `speed`)
- fill nested TCG-like template fields (`types`, `attacks`, `weaknesses`, `stage`, `retreat`, etc.)
- preserve already non-empty values in the provided template

Output: inferred JSON profile
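The basic-stat inference follows a simple counting rule (mirroring `_score_stat` in `json_inference.py`): each keyword that appears in a stat's hint set adds a fixed bonus to a base value, and the result is clamped to a playable range. A minimal sketch using the speed hint set from `STAT_HINTS`:

```python
# Each matching hint keyword adds 10 points; stats stay in [40, 160].
SPEED_HINTS = {"speed", "swift", "agile", "quick", "fast", "dash"}

def score_stat(base, keywords, hints):
    matches = sum(1 for k in keywords if k in hints)
    return max(40, min(160, base + matches * 10))

print(score_stat(70, ["agile", "quick", "furret"], SPEED_HINTS))  # 90
```

Because the rule is pure counting over fixed sets, the same keyword list always yields the same stats, which keeps the pipeline deterministic.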
### Stage 3: Orchestration CLI

File: `infer_json_usage.py`

This is the main entrypoint for end-to-end usage.

Default behavior:

1. prints the extracted keyword list
2. prints the inferred JSON
## Project Structure

- `keyword_extractor.py`: keyword extraction engine
- `json_inference.py`: keyword-to-JSON inference logic
- `infer_json_usage.py`: end-to-end CLI
- `example_usage.py`: keyword-extraction-only CLI
- `json_template_example.json`: sample blank/key-only template
- `test_keyword_extractor.py`: extraction tests
- `test_json_inference.py`: inference tests
- `requirements.txt`: Python dependencies
## Requirements

- Python 3.13 or lower is recommended for spaCy compatibility
- pip

Dependencies in `requirements.txt`:

- `spacy>=3.7.0`
- `yake>=0.4.2`
## Setup

1. Create and activate a virtual environment (recommended)

```bash
python -m venv .venv
source .venv/bin/activate
```

2. Install dependencies

```bash
pip install -r requirements.txt
```

3. Install the spaCy English model

```bash
python -m spacy download en_core_web_sm
```
## How To Run

### A) Extract keywords only

```bash
python example_usage.py "furret long slender agile creature with soft fur"
```

Output: a JSON list of keywords.

### B) End-to-end: text -> keywords -> JSON

```bash
python infer_json_usage.py --template json_template_example.json "furret long slender agile creature with soft fur"
```

Output order:

1. keyword list
2. inferred JSON

### C) End-to-end, JSON only

```bash
python infer_json_usage.py --json-only --template json_template_example.json "furret long slender agile creature with soft fur"
```

### D) Start from keywords directly

```bash
python infer_json_usage.py --template json_template_example.json --keywords furret normal tail smash tunnel agile cheerful explore endurance
```

Tip: if you pass `--keywords`, text extraction is skipped.
## Template Behavior

If `--template` is omitted, inference returns a full inferred profile object.

If `--template` is provided:

- empty fields are populated from inferred values
- non-empty fields are preserved

The current sample template supports nested card-like data including:

- `types`
- `attacks` with `cost`, `name`, `effect`, `damage`
- `weaknesses` with `type`, `value`
- `stage`, `retreat`, `legal`
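The fill-vs-preserve rule above can be sketched as a flat-dict version; the emptiness check matches `_is_empty_value` in `json_inference.py`, while the real code also recurses into nested card fields:

```python
def is_empty(value):
    # None, blank strings, and empty containers count as unfilled slots.
    if value is None:
        return True
    if isinstance(value, str):
        return value.strip() == ""
    if isinstance(value, (list, dict, tuple, set)):
        return len(value) == 0
    return False

def fill_template(template, inferred):
    # Populate empty fields from inferred values; keep non-empty ones.
    return {key: (inferred.get(key, value) if is_empty(value) else value)
            for key, value in template.items()}

print(fill_template({"name": "", "hp": 80}, {"name": "Furret", "hp": 60}))
# {'name': 'Furret', 'hp': 80}
```

Note that `hp` stays at 80: a non-empty template value always wins over the inferred one.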
## Tests

Run all tests:

```bash
python -m unittest -q
```
## Troubleshooting

### 1) spaCy model not found

The error mentions that `en_core_web_sm` is not installed.

Fix:

```bash
python -m spacy download en_core_web_sm
```

### 2) spaCy import/runtime problems on very new Python versions

Use Python 3.13 or lower and reinstall the requirements.

### 3) `--template` path errors

Ensure `--template` points to a valid file path, for example:

```bash
--template json_template_example.json
```

If your input is already a keyword list, use `--keywords` instead of putting the list in `--template`.
## Design Notes

- deterministic and explainable (no LLM calls)
- domain mappings are easy to extend in `keyword_extractor.py` and `json_inference.py`
- scoring and template-fill rules are intentionally simple and stable for game-content generation
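Since `KeywordExtractor.from_default_model` accepts a custom `normalization_map`, extending the domain mapping amounts to passing an augmented copy of the default. A minimal sketch with a toy map (the `shade`/`wraith` synonyms are hypothetical additions):

```python
# Build an extended normalization map without mutating the default.
DEFAULT_MAP = {"ghost": ["spirit", "phantom"], "fire": ["flame", "flames"]}

extended = {canon: list(syns) for canon, syns in DEFAULT_MAP.items()}
extended["ghost"] = extended["ghost"] + ["shade", "wraith"]  # hypothetical synonyms

# The extractor would then be built as:
# KeywordExtractor.from_default_model(normalization_map=extended)
print(extended["ghost"])  # ['spirit', 'phantom', 'shade', 'wraith']
```

Copying before extending keeps the module-level default untouched, so other extractors in the same process are unaffected.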
@@ -0,0 +1,137 @@
"""Rule-based keyword extraction and normalization for Pokemon card generation."""

from __future__ import annotations

from dataclasses import dataclass, field
from typing import Any, Dict, Iterable, List, Mapping, Optional, Sequence, Set, Tuple

# Canonical concept -> synonym list
DEFAULT_NORMALIZATION_MAP: Dict[str, List[str]] = {
    "normal": ["basic", "common", "regular", "plain"],
    "fire": ["flame", "flames", "burn", "burning", "blaze", "fiery", "heat", "inferno"],
    "water": ["wave", "ocean", "sea", "river", "aqua", "splash", "tidal"],
    "grass": ["plant", "leaf", "forest", "nature", "vine", "seed", "flora"],
    "flying": ["air", "wind", "sky", "wing", "wings", "flight", "soar"],
    "fighting": ["punch", "kick", "strike", "martial", "combat", "brawl"],
    "poison": ["toxic", "venom", "acid", "poisonous", "toxin"],
    "electric": ["lightning", "thunder", "shock", "volt", "spark", "electricity"],
    "ground": ["earth", "soil", "sand", "mud", "quake", "dust"],
    "rock": ["stone", "boulder", "crystal", "rocky", "pebble"],
    "psychic": ["mind", "mental", "telepathy", "psionic", "brain", "illusion"],
    "ice": ["freeze", "frozen", "snow", "frost", "blizzard", "icy"],
    "bug": ["insect", "ant", "beetle", "spider", "crawler"],
    "ghost": ["spirit", "phantom", "haunt", "shadow", "specter"],
    "steel": ["metal", "iron", "armor", "blade", "alloy"],
    "dragon": ["drake", "wyrm", "serpent", "legendary"],
    "dark": ["shadow", "evil", "night", "doom", "darkness"],
    "fairy": ["magic", "magical", "sparkle", "light", "charm"],
}

DEFAULT_ALLOWED_POS: Tuple[str, ...] = ("NOUN", "ADJ", "VERB")


def _invert_normalization_map(normalization_map: Mapping[str, Iterable[str]]) -> Dict[str, str]:
    """Build synonym -> canonical mapping for O(1) normalization lookup."""
    inverse: Dict[str, str] = {}
    for canonical, synonyms in normalization_map.items():
        canonical_normalized = canonical.strip().lower()
        inverse[canonical_normalized] = canonical_normalized
        for synonym in synonyms:
            synonym_normalized = synonym.strip().lower()
            if synonym_normalized:
                inverse[synonym_normalized] = canonical_normalized
    return inverse


def _deduplicate_preserve_order(items: Iterable[str]) -> List[str]:
    seen: Set[str] = set()
    output: List[str] = []
    for item in items:
        if item not in seen:
            seen.add(item)
            output.append(item)
    return output


@dataclass
class KeywordExtractor:
    """Deterministic spaCy + rule-based keyword extraction pipeline."""

    nlp: Any
    normalization_map: Mapping[str, Iterable[str]] = field(default_factory=lambda: DEFAULT_NORMALIZATION_MAP)
    allowed_pos: Sequence[str] = field(default_factory=lambda: DEFAULT_ALLOWED_POS)

    def __post_init__(self) -> None:
        self._normalization_lookup = _invert_normalization_map(self.normalization_map)
        self._allowed_pos_set = set(self.allowed_pos)

    @classmethod
    def from_default_model(
        cls,
        model_name: str = "en_core_web_sm",
        normalization_map: Optional[Mapping[str, Iterable[str]]] = None,
        allowed_pos: Sequence[str] = DEFAULT_ALLOWED_POS,
    ) -> "KeywordExtractor":
        """Initialize the extractor with a spaCy English pipeline."""
        try:
            import spacy

            nlp = spacy.load(model_name)
        except OSError as exc:
            raise OSError(
                f"spaCy model '{model_name}' is not installed. "
                "Run: python -m spacy download en_core_web_sm"
            ) from exc
        except Exception as exc:
            raise RuntimeError(
                "spaCy could not be loaded in this Python environment. "
                "Try Python 3.13 or lower, then install spaCy and en_core_web_sm."
            ) from exc

        return cls(
            nlp=nlp,
            normalization_map=normalization_map or DEFAULT_NORMALIZATION_MAP,
            allowed_pos=allowed_pos,
        )

    def extract(self, text: str) -> List[str]:
        """Extract and normalize keywords from already-cleaned text."""
        if not text or not text.strip():
            return []

        doc = self.nlp(text)

        # Step 1: POS filtering + base normalization to lowercase lemmas/tokens.
        raw_keywords: List[str] = []
        for token in doc:
            if token.is_stop or token.is_punct or token.pos_ not in self._allowed_pos_set:
                continue

            # Use the lemma where possible to collapse inflections.
            base = token.lemma_.lower().strip() if token.lemma_ and token.lemma_ != "-PRON-" else token.text.lower().strip()
            if base:
                raw_keywords.append(base)

        # Step 2: Deduplicate before domain normalization (as described in the README).
        deduplicated = _deduplicate_preserve_order(raw_keywords)

        # Step 3: Map variants/synonyms to canonical concepts.
        normalized = [self._normalize_keyword(keyword) for keyword in deduplicated]

        # Step 4: Deduplicate again, since multiple words can map to one concept.
        return _deduplicate_preserve_order(normalized)

    def _normalize_keyword(self, keyword: str) -> str:
        keyword_lower = keyword.lower()
        return self._normalization_lookup.get(keyword_lower, keyword_lower)


def extract_keywords(
    text: str,
    extractor: Optional[KeywordExtractor] = None,
) -> List[str]:
    """Convenience API to extract keywords with the default extractor config."""
    active_extractor = extractor or KeywordExtractor.from_default_model()
    return active_extractor.extract(text)
@@ -0,0 +1,88 @@
import unittest

from keyword_extractor import KeywordExtractor


class FakeToken:
    def __init__(self, text: str, pos: str, lemma: str, is_stop: bool) -> None:
        self.text = text
        self.pos_ = pos
        self.lemma_ = lemma
        self.is_stop = is_stop
        self.is_punct = not any(ch.isalnum() for ch in text)


class FakeNLP:
    def __init__(self, tag_map, stopwords) -> None:
        self.tag_map = tag_map
        self.stopwords = stopwords

    def __call__(self, text: str):
        tokens = []
        for raw in text.split():
            token_text = raw.strip()
            lowered = token_text.lower()
            tokens.append(
                FakeToken(
                    text=token_text,
                    pos=self.tag_map.get(lowered, "NOUN"),
                    lemma=lowered,
                    is_stop=lowered in self.stopwords,
                )
            )
        return tokens


class KeywordExtractorTests(unittest.TestCase):
    @classmethod
    def setUpClass(cls) -> None:
        tag_map = {
            "fiery": "ADJ",
            "dragon": "NOUN",
            "attack": "VERB",
            "explosive": "ADJ",
            "flames": "NOUN",
            "burning": "ADJ",
            "creature": "NOUN",
            "with": "ADP",
            "blaze": "NOUN",
            "power": "NOUN",
            "electric": "ADJ",
            "mouse": "NOUN",
            "using": "VERB",
            "thunder": "NOUN",
            "shock": "NOUN",
            "a": "DET",
            "very": "ADV",
            "strong": "ADJ",
            "and": "CCONJ",
            "dangerous": "ADJ",
        }

        stopwords = {"a", "very", "and", "with"}
        cls.nlp = FakeNLP(tag_map=tag_map, stopwords=stopwords)
        cls.extractor = KeywordExtractor(nlp=cls.nlp)

    def test_readme_main_example(self) -> None:
        text = "fiery dragon attack explosive flames"
        result = self.extractor.extract(text)
        # "fiery" and "flames" both normalize to "fire"; "explosive" has no
        # entry in DEFAULT_NORMALIZATION_MAP, so it passes through unchanged.
        self.assertEqual(result, ["fire", "dragon", "attack", "explosive"])

    def test_synonym_normalization(self) -> None:
        text = "burning creature with blaze power"
        result = self.extractor.extract(text)
        self.assertEqual(result, ["fire", "creature", "power"])

    def test_mixed_types(self) -> None:
        text = "electric mouse using thunder shock"
        result = self.extractor.extract(text)
        self.assertEqual(result, ["electric", "mouse", "using"])

    def test_noise_input(self) -> None:
        text = "a very very strong and dangerous creature"
        result = self.extractor.extract(text)
        self.assertEqual(result, ["strong", "dangerous", "creature"])


if __name__ == "__main__":
    unittest.main()
189
clean-text-to-keywords/README.md
Normal file
||||
Binary file not shown.
Binary file not shown.
36
clean-text-to-keywords/example_usage.py
Normal file
@@ -0,0 +1,36 @@
import argparse
import json
from typing import Sequence

from keyword_extractor import KeywordExtractor


def _build_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(
        description="Extract normalized keywords from cleaned text.",
    )
    parser.add_argument(
        "text",
        nargs="+",
        help="Input text to process. Pass as one quoted string or multiple words.",
    )
    parser.add_argument(
        "--model",
        default="en_core_web_sm",
        help="spaCy model name (default: en_core_web_sm).",
    )
    return parser


def main(argv: Sequence[str] | None = None) -> None:
    parser = _build_parser()
    args = parser.parse_args(argv)

    text = " ".join(args.text)
    extractor = KeywordExtractor.from_default_model(model_name=args.model)
    keywords = extractor.extract(text)
    print(json.dumps(keywords))


if __name__ == "__main__":
    main()
111
clean-text-to-keywords/infer_json_usage.py
Normal file
@@ -0,0 +1,111 @@
import argparse
import json
import os
import re
from typing import Sequence

from keyword_extractor import KeywordExtractor
from json_inference import fill_template_from_keywords


def _build_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(
        description="Extract keywords and infer values into a JSON template.",
    )
    parser.add_argument(
        "text",
        nargs="*",
        help="Input description text.",
    )
    parser.add_argument(
        "--template",
        default="",
        help="Path to JSON template file with keys only. If omitted, full inferred JSON is returned.",
    )
    parser.add_argument(
        "--model",
        default="en_core_web_sm",
        help="spaCy model name (default: en_core_web_sm).",
    )
    parser.add_argument(
        "--keywords",
        nargs="+",
        default=None,
        help="Provide keywords directly instead of raw text.",
    )
    parser.add_argument(
        "--json-only",
        action="store_true",
        help="Print only inferred JSON (skip keyword list output).",
    )
    return parser


def _load_template(path: str):
    if not path:
        return {}

    if not os.path.exists(path):
        raise FileNotFoundError(f"Template file not found: {path}")

    with open(path, "r", encoding="utf-8") as file_handle:
        raw = file_handle.read().strip()
    if not raw:
        return {}
    return json.loads(raw)


def _parse_keywords_fragment(raw: str):
    if not raw.strip():
        return []

    try:
        parsed = json.loads(raw)
        if isinstance(parsed, list):
            return [str(item).strip().lower() for item in parsed if str(item).strip()]
    except json.JSONDecodeError:
        pass

    tokens = re.findall(r"[a-zA-Z0-9_-]+", raw.lower())
    return [token for token in tokens if token]


def _extract_keywords(args):
    if args.keywords:
        return [word.strip().lower() for word in args.keywords if word.strip()]

    if args.template and not os.path.exists(args.template) and args.template.lstrip().startswith("["):
        raw = " ".join([args.template] + args.text)
        return _parse_keywords_fragment(raw)

    if not args.text:
        raise ValueError("Provide input text or use --keywords.")

    text = " ".join(args.text)
    extractor = KeywordExtractor.from_default_model(model_name=args.model)
    return extractor.extract(text)


def main(argv: Sequence[str] | None = None) -> None:
    parser = _build_parser()
    args = parser.parse_args(argv)

    keywords = _extract_keywords(args)

    template_path = args.template
    if args.template and not os.path.exists(args.template) and args.template.lstrip().startswith("["):
        template_path = ""

    template = _load_template(template_path)
    inferred_json = fill_template_from_keywords(template, keywords)

    if args.json_only:
        print(json.dumps(inferred_json, indent=2))
        return

    print(json.dumps(keywords))
    print(json.dumps(inferred_json, indent=2))


if __name__ == "__main__":
    main()
398
clean-text-to-keywords/json_inference.py
Normal file
@@ -0,0 +1,398 @@
"""Infer Pokemon-like JSON values from extracted keywords."""

from __future__ import annotations

from copy import deepcopy
from typing import Any, Dict, Iterable, List, Mapping, Sequence

POKEMON_TYPES = {
    "normal", "fire", "water", "grass", "electric", "ice", "fighting", "poison",
    "ground", "flying", "psychic", "bug", "rock", "ghost", "dragon", "dark",
    "steel", "fairy",
}

HABITAT_KEYWORDS = {
    "forest", "field", "cave", "mountain", "river", "ocean", "sea", "tunnel",
    "nest", "sky", "desert", "swamp", "volcano",
}

PERSONALITY_KEYWORDS = {
    "calm", "gentle", "agile", "playful", "cheerful", "energetic", "curious",
    "fierce", "brave", "loyal", "timid", "bold",
}

MOVE_KEYWORDS = {
    "attack", "smash", "strike", "kick", "punch", "shock", "thunder", "bolt",
    "blast", "explosion", "freeze", "bite", "claw", "tail", "fight",
}

ABILITY_KEYWORDS = {
    "recover", "endurance", "explore", "hide", "wander", "bond", "speed",
    "power", "energy", "flexible",
}

STAT_HINTS = {
    "hp": {"endurance", "recover", "energy", "stamina", "healthy", "vital"},
    "attack": {"attack", "smash", "strike", "punch", "kick", "claw", "fight", "power"},
    "defense": {"armor", "shield", "tough", "hard", "resist", "solid"},
    "speed": {"speed", "swift", "agile", "quick", "fast", "dash"},
}

KEY_ALIASES = {
    "name": {"name", "pokemon_name"},
    "type": {"type", "primary_type", "pokemon_type"},
    "secondary_type": {"secondary_type", "type2", "secondary"},
    "attacks": {"attacks", "moves", "skills", "offense"},
    "abilities": {"abilities", "traits", "passives", "special_abilities"},
    "habitat": {"habitat", "environment", "region"},
    "personality": {"personality", "temperament", "nature"},
    "description": {"description", "flavor_text", "summary", "lore"},
    "keywords": {"keywords", "tags"},
    "hp": {"hp", "health", "health_points"},
    "attack": {"attack", "atk"},
    "defense": {"defense", "def"},
    "speed": {"speed", "spd"},
}

GENERIC_NAME_BLACKLIST = {
    "black", "white", "yellow", "red", "blue", "green", "purple", "orange",
    "pink", "gray", "grey", "brown", "fur", "body", "tail", "claw", "storm",
    "cloud", "enemy", "super", "scary", "giant", "speed",
}

TYPE_WEAKNESS = {
    "normal": "fighting",
    "fire": "water",
    "water": "electric",
    "grass": "fire",
    "electric": "ground",
    "ice": "fire",
    "fighting": "psychic",
    "poison": "ground",
    "ground": "water",
    "flying": "electric",
    "psychic": "dark",
    "bug": "fire",
    "rock": "water",
    "ghost": "dark",
    "dragon": "fairy",
    "dark": "fighting",
    "steel": "fire",
    "fairy": "steel",
}


def _title_case(value: str) -> str:
    return " ".join(part.capitalize() for part in value.split())


def _is_empty_value(value: Any) -> bool:
    if value is None:
        return True
    if isinstance(value, str):
        return value.strip() == ""
    if isinstance(value, (list, dict, tuple, set)):
        return len(value) == 0
    return False


def _canonical_key(key: str) -> str:
    lowered = key.lower().strip()
    for canonical, aliases in KEY_ALIASES.items():
        if lowered in aliases:
            return canonical
    return lowered


def _pick_name(keywords: Sequence[str]) -> str:
    for keyword in keywords:
        if keyword in POKEMON_TYPES:
            continue
        if keyword in HABITAT_KEYWORDS:
            continue
        if keyword in MOVE_KEYWORDS:
            continue
        if keyword in ABILITY_KEYWORDS:
            continue
        if keyword in PERSONALITY_KEYWORDS:
            continue
        if keyword in GENERIC_NAME_BLACKLIST:
            continue
        if len(keyword) < 4:
            continue
        return _title_case(keyword)
    return "Unknown"


def _pick_types(keywords: Sequence[str]) -> List[str]:
    types: List[str] = []
    for keyword in keywords:
        if keyword in POKEMON_TYPES and keyword not in types:
            types.append(keyword)
        if len(types) >= 2:
            break
    if not types:
        types.append("normal")
    return types


def _pick_habitat(keywords: Sequence[str]) -> str:
    habitats = [word for word in keywords if word in HABITAT_KEYWORDS]
    if not habitats:
        return "unknown"
    return habitats[0]


def _pick_personality(keywords: Sequence[str]) -> List[str]:
    result: List[str] = []
    for keyword in keywords:
        if keyword in PERSONALITY_KEYWORDS and keyword not in result:
            result.append(keyword)
    return result[:3]


def _pick_attacks(keywords: Sequence[str]) -> List[str]:
    attacks: List[str] = []
    for keyword in keywords:
        if keyword in MOVE_KEYWORDS and keyword not in attacks:
            attacks.append(keyword)
    return attacks[:4]


def _pick_abilities(keywords: Sequence[str]) -> List[str]:
    abilities: List[str] = []
    for keyword in keywords:
        if keyword in ABILITY_KEYWORDS and keyword not in abilities:
            abilities.append(keyword)
    return abilities[:4]


def _score_stat(base: int, keywords: Sequence[str], hints: Iterable[str]) -> int:
    hint_set = set(hints)
    matches = sum(1 for keyword in keywords if keyword in hint_set)
    # Each match adds 10 points; keep stats in [40, 160].
    return max(40, min(160, base + (matches * 10)))


def _build_description(name: str, primary_type: str, attacks: Sequence[str], abilities: Sequence[str], habitat: str) -> str:
    attack_text = ", ".join(attacks) if attacks else "basic combat"
    ability_text = ", ".join(abilities) if abilities else "balanced adaptation"
    return (
        f"{name} is a {primary_type}-type Pokemon often found in {habitat}. "
        f"It commonly uses {attack_text} and shows abilities like {ability_text}."
    )


def _retreat_cost_from_speed(speed: int) -> int:
    if speed >= 120:
        return 0
    if speed >= 90:
        return 1
    if speed >= 70:
        return 2
    return 3


def _attack_damage_from_attack_stat(attack_stat: int, index: int) -> int:
    # Keep card damage in simple 10-step increments.
    base = 30 + max(0, attack_stat - 70) // 2
    adjusted = base + (index * 10)
    return max(10, min(160, (adjusted // 10) * 10))


def _energy_name_for_type(pokemon_type: str) -> str:
    if pokemon_type == "normal":
        return "Colorless"
    return _title_case(pokemon_type)


def _fill_tcg_like_template(output: Dict[str, Any], inferred: Mapping[str, Any]) -> None:
    if "name" in output and _is_empty_value(output.get("name")):
        output["name"] = inferred["name"]

    if "description" in output and _is_empty_value(output.get("description")):
        output["description"] = inferred["description"]

    if "hp" in output and _is_empty_value(output.get("hp")):
        hp_value = inferred["hp"]
|
||||
output["hp"] = str(hp_value) if isinstance(output.get("hp"), str) else hp_value
|
||||
|
||||
if "types" in output and isinstance(output.get("types"), list):
|
||||
types_value = output["types"]
|
||||
if len(types_value) == 0 or all(_is_empty_value(item) for item in types_value):
|
||||
inferred_types = [inferred["type"]]
|
||||
if inferred.get("secondary_type"):
|
||||
inferred_types.append(inferred["secondary_type"])
|
||||
output["types"] = inferred_types
|
||||
|
||||
if "stage" in output and _is_empty_value(output.get("stage")):
|
||||
output["stage"] = "Basic"
|
||||
|
||||
if "retreat" in output and (output.get("retreat") in (None, 0, "")):
|
||||
output["retreat"] = _retreat_cost_from_speed(int(inferred["speed"]))
|
||||
|
||||
if "weaknesses" in output and isinstance(output.get("weaknesses"), list):
|
||||
weaknesses = output["weaknesses"]
|
||||
if weaknesses:
|
||||
weakness_type = TYPE_WEAKNESS.get(inferred["type"], "fighting")
|
||||
first = weaknesses[0]
|
||||
if isinstance(first, dict):
|
||||
if _is_empty_value(first.get("type")):
|
||||
first["type"] = weakness_type
|
||||
if _is_empty_value(first.get("value")):
|
||||
first["value"] = "x2"
|
||||
|
||||
if "attacks" in output and isinstance(output.get("attacks"), list):
|
||||
attack_entries = output["attacks"]
|
||||
inferred_attacks = inferred["attacks"]
|
||||
inferred_type = inferred["type"]
|
||||
for idx, attack_entry in enumerate(attack_entries):
|
||||
if not isinstance(attack_entry, dict):
|
||||
continue
|
||||
|
||||
attack_name = inferred_attacks[idx] if idx < len(inferred_attacks) else "tackle"
|
||||
attack_title = _title_case(attack_name)
|
||||
if _is_empty_value(attack_entry.get("name")):
|
||||
attack_entry["name"] = attack_title
|
||||
if _is_empty_value(attack_entry.get("effect")):
|
||||
attack_entry["effect"] = f"Deals damage with {attack_name}."
|
||||
|
||||
if "damage" in attack_entry and (attack_entry.get("damage") in (None, 0, "")):
|
||||
attack_entry["damage"] = _attack_damage_from_attack_stat(int(inferred["attack"]), idx)
|
||||
|
||||
if "cost" in attack_entry and isinstance(attack_entry.get("cost"), list):
|
||||
current_cost = attack_entry["cost"]
|
||||
if len(current_cost) == 0 or all(_is_empty_value(item) for item in current_cost):
|
||||
attack_entry["cost"] = [_energy_name_for_type(inferred_type)]
|
||||
|
||||
|
||||
def infer_profile_from_keywords(keywords: Sequence[str]) -> Dict[str, Any]:
|
||||
cleaned = [k.strip().lower() for k in keywords if k and k.strip()]
|
||||
|
||||
name = _pick_name(cleaned)
|
||||
types = _pick_types(cleaned)
|
||||
attacks = _pick_attacks(cleaned)
|
||||
abilities = _pick_abilities(cleaned)
|
||||
habitat = _pick_habitat(cleaned)
|
||||
personality = _pick_personality(cleaned)
|
||||
|
||||
hp = _score_stat(70, cleaned, STAT_HINTS["hp"])
|
||||
attack = _score_stat(70, cleaned, STAT_HINTS["attack"])
|
||||
defense = _score_stat(70, cleaned, STAT_HINTS["defense"])
|
||||
speed = _score_stat(70, cleaned, STAT_HINTS["speed"])
|
||||
|
||||
return {
|
||||
"name": name,
|
||||
"type": types[0],
|
||||
"secondary_type": types[1] if len(types) > 1 else None,
|
||||
"attacks": attacks,
|
||||
"abilities": abilities,
|
||||
"habitat": habitat,
|
||||
"personality": personality,
|
||||
"hp": hp,
|
||||
"attack": attack,
|
||||
"defense": defense,
|
||||
"speed": speed,
|
||||
"keywords": cleaned,
|
||||
"description": _build_description(name, types[0], attacks, abilities, habitat),
|
||||
}
|
||||
|
||||
|
||||
def fill_template_from_keywords(template: Mapping[str, Any], keywords: Sequence[str]) -> Dict[str, Any]:
|
||||
"""Fill a key-only template by inferring values from keywords.
|
||||
|
||||
Existing non-empty values in template are preserved.
|
||||
"""
|
||||
inferred = infer_profile_from_keywords(keywords)
|
||||
output: Dict[str, Any] = deepcopy(dict(template))
|
||||
|
||||
if not output:
|
||||
return inferred
|
||||
|
||||
_fill_tcg_like_template(output, inferred)
|
||||
|
||||
for key, current_value in output.items():
|
||||
canonical = _canonical_key(key)
|
||||
if canonical not in inferred:
|
||||
continue
|
||||
if _is_empty_value(current_value):
|
||||
output[key] = inferred[canonical]
|
||||
|
||||
return output
|
||||
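As a quick sanity check on the numeric mapping above, here is a standalone sketch of the retreat-cost and attack-damage helpers. The logic mirrors the two pure functions in `json_inference.py`; the input values are illustrative examples only.

```python
# Standalone copies of the two pure numeric helpers from json_inference.py,
# reproduced here only to illustrate the mapping.

def retreat_cost_from_speed(speed: int) -> int:
    # Faster Pokemon retreat more cheaply (0 is a free retreat).
    if speed >= 120:
        return 0
    if speed >= 90:
        return 1
    if speed >= 70:
        return 2
    return 3


def attack_damage_from_attack_stat(attack_stat: int, index: int) -> int:
    # Damage grows with the attack stat, snapped to 10-step increments
    # and clamped to the [10, 160] range used on cards.
    base = 30 + max(0, attack_stat - 70) // 2
    adjusted = base + (index * 10)
    return max(10, min(160, (adjusted // 10) * 10))


print(retreat_cost_from_speed(100))           # speed 100 -> retreat cost 1
print(attack_damage_from_attack_stat(90, 0))  # attack 90, first slot -> 40
print(attack_damage_from_attack_stat(90, 1))  # second attack slot -> 50
```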
clean-text-to-keywords/json_template_example.json (new file, 35 lines)
@@ -0,0 +1,35 @@
{
  "category": "Pokemon",
  "name": "",
  "rarity": "",
  "hp": "",
  "types": [""],
  "evolveFrom": "",
  "description": "",
  "stage": "",
  "attacks": [
    {
      "cost": [""],
      "name": "",
      "effect": ""
    },
    {
      "cost": [""],
      "name": "",
      "effect": "",
      "damage": 0
    }
  ],
  "weaknesses": [
    {
      "type": "",
      "value": ""
    }
  ],
  "retreat": 0,
  "regulationMark": "",
  "legal": {
    "standard": true,
    "expanded": true
  }
}
clean-text-to-keywords/keyword_extractor.py (new file, 248 lines)
@@ -0,0 +1,248 @@
"""Rule-based keyword extraction and normalization for Pokemon card generation."""

from __future__ import annotations

import math
import re
from dataclasses import dataclass, field
from typing import Any, Dict, Iterable, List, Mapping, Optional, Sequence, Set, Tuple

DEFAULT_NORMALIZATION_MAP: Dict[str, List[str]] = {
    "normal": ["basic", "common", "regular", "plain", "normaltype"],
    "fire": ["flame", "flames", "burn", "burning", "blaze", "fiery", "heat", "inferno"],
    "water": ["wave", "ocean", "sea", "river", "aqua", "splash", "tidal"],
    "grass": ["plant", "leaf", "forest", "nature", "vine", "seed", "flora"],
    "flying": ["air", "wind", "sky", "wing", "wings", "flight", "soar"],
    "fighting": ["punch", "kick", "strike", "martial", "combat", "brawl"],
    "poison": ["toxic", "venom", "acid", "poisonous", "toxin"],
    "electric": ["lightning", "thunder", "shock", "volt", "spark", "electricity"],
    "ground": ["earth", "soil", "sand", "mud", "quake", "dust"],
    "rock": ["stone", "boulder", "crystal", "rocky", "pebble"],
    "psychic": ["mind", "mental", "telepathy", "psyonic", "brain", "illusion"],
    "ice": ["freeze", "frozen", "snow", "frost", "blizzard", "icy"],
    "bug": ["insect", "ant", "beetle", "spider", "crawler"],
    "ghost": ["spirit", "phantom", "haunt", "shadow", "specter"],
    "steel": ["metal", "iron", "armor", "blade", "alloy"],
    "dragon": ["drake", "wyrm", "serpent", "legendary"],
    "dark": ["shadow", "evil", "night", "doom", "darkness"],
    "fairy": ["magic", "magical", "sparkle", "light", "charm"],
    "explosion": ["explosive", "explode", "blast"],
}

DEFAULT_ALLOWED_POS: Tuple[str, ...] = ("NOUN", "ADJ", "VERB")
DEFAULT_IGNORED_KEYWORDS: Set[str] = {"preevolution", "pokmon"}
DEFAULT_POS_WEIGHTS: Dict[str, float] = {
    "NOUN": 3.0,
    "ADJ": 2.0,
    "VERB": 1.0,
}
DEFAULT_KEEP_RATIO = 0.8
DEFAULT_MIN_KEYWORDS = 12
DEFAULT_MAX_KEYWORDS = 30


def _invert_normalization_map(normalization_map: Mapping[str, Iterable[str]]) -> Dict[str, str]:
    """Build synonym -> canonical mapping for O(1) normalization lookup."""
    inverse: Dict[str, str] = {}
    for canonical, synonyms in normalization_map.items():
        canonical_normalized = canonical.strip().lower()
        inverse[canonical_normalized] = canonical_normalized
        for synonym in synonyms:
            synonym_normalized = synonym.strip().lower()
            if synonym_normalized:
                inverse[synonym_normalized] = canonical_normalized
    return inverse


def _tokenize_keyword_phrase(value: str) -> List[str]:
    return re.findall(r"[a-z0-9]+", value.lower())


@dataclass
class KeywordExtractor:
    """Deterministic spaCy + YAKE + rule-based normalization pipeline."""

    nlp: Any
    normalization_map: Mapping[str, Iterable[str]] = field(default_factory=lambda: DEFAULT_NORMALIZATION_MAP)
    allowed_pos: Sequence[str] = field(default_factory=lambda: DEFAULT_ALLOWED_POS)
    ignored_keywords: Set[str] = field(default_factory=lambda: set(DEFAULT_IGNORED_KEYWORDS))
    pos_weights: Mapping[str, float] = field(default_factory=lambda: DEFAULT_POS_WEIGHTS)
    keep_ratio: float = DEFAULT_KEEP_RATIO
    min_keywords: int = DEFAULT_MIN_KEYWORDS
    max_keywords: int = DEFAULT_MAX_KEYWORDS
    use_yake: bool = True

    def __post_init__(self) -> None:
        self._normalization_lookup = _invert_normalization_map(self.normalization_map)
        self._allowed_pos_set = set(self.allowed_pos)
        self._ignored_keywords = {keyword.lower().strip() for keyword in self.ignored_keywords}
        self._pos_weight_lookup = {k.upper(): float(v) for k, v in self.pos_weights.items()}

    @classmethod
    def from_default_model(
        cls,
        model_name: str = "en_core_web_sm",
        normalization_map: Optional[Mapping[str, Iterable[str]]] = None,
        allowed_pos: Sequence[str] = DEFAULT_ALLOWED_POS,
        ignored_keywords: Optional[Set[str]] = None,
        pos_weights: Mapping[str, float] = DEFAULT_POS_WEIGHTS,
        keep_ratio: float = DEFAULT_KEEP_RATIO,
        min_keywords: int = DEFAULT_MIN_KEYWORDS,
        max_keywords: int = DEFAULT_MAX_KEYWORDS,
        use_yake: bool = True,
    ) -> "KeywordExtractor":
        """Initialize extractor with a spaCy English pipeline."""
        try:
            import spacy

            nlp = spacy.load(model_name)
        except OSError as exc:
            raise OSError(
                f"spaCy model '{model_name}' is not installed. "
                "Run: python -m spacy download en_core_web_sm"
            ) from exc
        except Exception as exc:
            raise RuntimeError(
                "spaCy could not be loaded in this Python environment. "
                "Try Python 3.13 or lower, then install spaCy and en_core_web_sm."
            ) from exc

        return cls(
            nlp=nlp,
            normalization_map=normalization_map or DEFAULT_NORMALIZATION_MAP,
            allowed_pos=allowed_pos,
            ignored_keywords=ignored_keywords or set(DEFAULT_IGNORED_KEYWORDS),
            pos_weights=pos_weights,
            keep_ratio=keep_ratio,
            min_keywords=min_keywords,
            max_keywords=max_keywords,
            use_yake=use_yake,
        )

    def extract(self, text: str) -> List[str]:
        """Extract, normalize and rank keywords from already-cleaned text."""
        if not text or not text.strip():
            return []

        doc = self.nlp(text)

        # Step 1: POS filtering + lowercase lemma/token extraction.
        raw_keywords: List[Tuple[str, str]] = []
        for token in doc:
            if token.is_stop or token.is_punct or token.pos_ not in self._allowed_pos_set:
                continue

            base = token.lemma_.lower().strip() if token.lemma_ and token.lemma_ != "-PRON-" else token.text.lower().strip()
            if base and base not in self._ignored_keywords:
                raw_keywords.append((base, token.pos_))

        # Step 2: Deduplicate before domain normalization.
        deduplicated: List[Tuple[str, str]] = []
        seen_raw: Set[str] = set()
        for keyword, pos in raw_keywords:
            if keyword in seen_raw:
                continue
            seen_raw.add(keyword)
            deduplicated.append((keyword, pos))

        # Step 3: Normalize and deduplicate canonical forms.
        unique_entries: List[Tuple[str, str, str, int]] = []
        seen_normalized: Set[str] = set()
        for index, (original_keyword, pos) in enumerate(deduplicated):
            normalized_keyword = self._normalize_keyword(original_keyword)
            if normalized_keyword in seen_normalized:
                continue
            seen_normalized.add(normalized_keyword)
            unique_entries.append((original_keyword, normalized_keyword, pos, index))

        if not unique_entries:
            return []

        if not self.use_yake:
            return [normalized_keyword for _, normalized_keyword, _, _ in unique_entries]

        # Step 4: YAKE scoring + conservative selection to preserve detail.
        yake_scores = self._extract_yake_scores(text)
        if not yake_scores:
            return [normalized_keyword for _, normalized_keyword, _, _ in unique_entries]

        ranked: List[Tuple[float, int, str]] = []
        for original_keyword, normalized_keyword, pos, index in unique_entries:
            score_candidates: List[float] = []
            if original_keyword in yake_scores:
                score_candidates.append(yake_scores[original_keyword])
            if normalized_keyword in yake_scores:
                score_candidates.append(yake_scores[normalized_keyword])

            # Missing score is treated as moderately relevant to avoid over-pruning.
            yake_penalty = min(score_candidates) if score_candidates else 0.45
            pos_weight = self._pos_weight_lookup.get(pos.upper(), 1.0)
            combined_score = (1.0 - yake_penalty) * pos_weight
            ranked.append((combined_score, index, normalized_keyword))

        target_count = self._compute_target_count(len(ranked))
        ranked.sort(key=lambda item: (-item[0], item[1]))
        selected = ranked[:target_count]
        selected.sort(key=lambda item: item[1])

        return [keyword for _, _, keyword in selected]

    def _compute_target_count(self, total_keywords: int) -> int:
        if total_keywords <= 0:
            return 0

        target = max(self.min_keywords, math.ceil(total_keywords * self.keep_ratio))
        if self.max_keywords > 0:
            target = min(target, self.max_keywords)
        return min(target, total_keywords)

    def _extract_yake_scores(self, text: str) -> Dict[str, float]:
        try:
            import yake
        except Exception:
            return {}

        text_token_count = len(text.split())
        top_n = max(20, min(80, text_token_count * 2))

        try:
            extractor = yake.KeywordExtractor(lan="en", n=2, dedupLim=0.9, top=top_n)
            phrase_scores = extractor.extract_keywords(text)
        except Exception:
            return {}

        token_scores: Dict[str, float] = {}
        for phrase, score in phrase_scores:
            for token in _tokenize_keyword_phrase(phrase):
                existing = token_scores.get(token)
                if existing is None or score < existing:
                    token_scores[token] = score

        if not token_scores:
            return {}

        values = list(token_scores.values())
        min_score = min(values)
        max_score = max(values)

        if math.isclose(min_score, max_score):
            return {token: 0.5 for token in token_scores}

        # Normalize so 0.0=most important and 1.0=least important.
        return {
            token: (score - min_score) / (max_score - min_score)
            for token, score in token_scores.items()
        }

    def _normalize_keyword(self, keyword: str) -> str:
        keyword_lower = keyword.lower()
        return self._normalization_lookup.get(keyword_lower, keyword_lower)


def extract_keywords(
    text: str,
    extractor: Optional[KeywordExtractor] = None,
) -> List[str]:
    """Convenience API to extract keywords with default extractor config."""
    active_extractor = extractor or KeywordExtractor.from_default_model()
    return active_extractor.extract(text)
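The synonym inversion is the core of the extractor's O(1) normalization lookup. A minimal standalone sketch of that step, using a trimmed example map rather than the module's full `DEFAULT_NORMALIZATION_MAP`:

```python
# Minimal sketch of the synonym -> canonical inversion used by the extractor.
# The map below is a trimmed example, not the full DEFAULT_NORMALIZATION_MAP.
normalization_map = {
    "fire": ["flame", "flames", "blaze", "fiery"],
    "electric": ["lightning", "thunder", "shock"],
}

inverse = {}
for canonical, synonyms in normalization_map.items():
    inverse[canonical] = canonical  # a canonical term maps to itself
    for synonym in synonyms:
        inverse[synonym] = canonical


def normalize(word: str) -> str:
    # Lookup is a single dict access; unknown words pass through unchanged.
    return inverse.get(word.lower(), word.lower())


print(normalize("Flames"))   # -> fire
print(normalize("thunder"))  # -> electric
print(normalize("dragon"))   # -> dragon (no synonym entry)
```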
clean-text-to-keywords/requirements.txt (new file, 2 lines)
@@ -0,0 +1,2 @@
spacy>=3.7.0
yake>=0.4.2
clean-text-to-keywords/test_json_inference.py (new file, 143 lines)
@@ -0,0 +1,143 @@
import unittest

from json_inference import fill_template_from_keywords, infer_profile_from_keywords


class JsonInferenceTests(unittest.TestCase):
    def test_profile_inference_basics(self) -> None:
        keywords = [
            "zapthorn",
            "electric",
            "wolf",
            "thunder",
            "claw",
            "speed",
            "storm",
            "agile",
            "forest",
            "recover",
            "energy",
        ]

        profile = infer_profile_from_keywords(keywords)

        self.assertEqual(profile["name"], "Zapthorn")
        self.assertEqual(profile["type"], "electric")
        self.assertIn("thunder", profile["attacks"])
        self.assertIn("claw", profile["attacks"])
        self.assertIn("recover", profile["abilities"])
        self.assertEqual(profile["habitat"], "forest")
        self.assertGreaterEqual(profile["speed"], 80)

    def test_fill_key_only_template(self) -> None:
        template = {
            "name": "",
            "type": "",
            "secondary_type": None,
            "attacks": [],
            "abilities": [],
            "habitat": "",
            "personality": [],
            "hp": None,
            "attack": None,
            "defense": None,
            "speed": None,
            "description": "",
            "keywords": [],
        }

        keywords = [
            "furret",
            "normal",
            "tail",
            "smash",
            "tunnel",
            "agile",
            "cheerful",
            "explore",
            "endurance",
        ]

        result = fill_template_from_keywords(template, keywords)

        self.assertEqual(result["name"], "Furret")
        self.assertEqual(result["type"], "normal")
        self.assertIn("smash", result["attacks"])
        self.assertIn("explore", result["abilities"])
        self.assertEqual(result["habitat"], "tunnel")
        self.assertIn("cheerful", result["personality"])
        self.assertIsInstance(result["description"], str)
        self.assertGreater(len(result["description"]), 20)

    def test_fill_tcg_style_template(self) -> None:
        template = {
            "category": "Pokemon",
            "name": "",
            "hp": "",
            "types": [""],
            "description": "",
            "stage": "",
            "attacks": [
                {"cost": [""], "name": "", "effect": ""},
                {"cost": [""], "name": "", "effect": "", "damage": 0},
            ],
            "weaknesses": [{"type": "", "value": ""}],
            "retreat": 0,
        }

        keywords = [
            "zapthorn",
            "electric",
            "thunder",
            "claw",
            "speed",
            "storm",
            "energy",
        ]

        result = fill_template_from_keywords(template, keywords)

        self.assertEqual(result["name"], "Zapthorn")
        self.assertEqual(result["types"], ["electric"])
        self.assertEqual(result["stage"], "Basic")
        self.assertTrue(result["hp"].isdigit())
        self.assertEqual(result["weaknesses"][0]["type"], "ground")
        self.assertEqual(result["weaknesses"][0]["value"], "x2")
        self.assertEqual(result["attacks"][0]["name"], "Thunder")
        self.assertEqual(result["attacks"][1]["name"], "Claw")
        self.assertEqual(result["attacks"][0]["cost"], ["Electric"])
        self.assertGreaterEqual(result["retreat"], 0)

    def test_name_fallback_to_unknown_for_generic_tokens(self) -> None:
        keywords = [
            "black",
            "fur",
            "giant",
            "electric",
            "claw",
            "speed",
            "storm",
        ]

        profile = infer_profile_from_keywords(keywords)
        self.assertEqual(profile["name"], "Unknown")

    def test_preserves_existing_values(self) -> None:
        template = {
            "name": "CustomName",
            "type": "electric",
            "attacks": [],
            "description": "Already set",
        }
        keywords = ["furret", "normal", "attack"]

        result = fill_template_from_keywords(template, keywords)

        self.assertEqual(result["name"], "CustomName")
        self.assertEqual(result["type"], "electric")
        self.assertEqual(result["description"], "Already set")
        self.assertIn("attack", result["attacks"])


if __name__ == "__main__":
    unittest.main()
clean-text-to-keywords/test_keyword_extractor.py (new file, 166 lines)
@@ -0,0 +1,166 @@
import unittest

from keyword_extractor import KeywordExtractor


class FakeToken:
    def __init__(self, text: str, pos: str, lemma: str, is_stop: bool) -> None:
        self.text = text
        self.pos_ = pos
        self.lemma_ = lemma
        self.is_stop = is_stop
        self.is_punct = not any(ch.isalnum() for ch in text)


class FakeNLP:
    def __init__(self, tag_map, stopwords) -> None:
        self.tag_map = tag_map
        self.stopwords = stopwords

    def __call__(self, text: str):
        tokens = []
        for raw in text.split():
            token_text = raw.strip()
            lowered = token_text.lower()
            tokens.append(
                FakeToken(
                    text=token_text,
                    pos=self.tag_map.get(lowered, "NOUN"),
                    lemma=lowered,
                    is_stop=lowered in self.stopwords,
                )
            )
        return tokens


class TestableKeywordExtractor(KeywordExtractor):
    def __init__(self, *args, yake_scores=None, **kwargs):
        super().__init__(*args, **kwargs)
        self._test_yake_scores = yake_scores or {}

    def _extract_yake_scores(self, text: str):
        return self._test_yake_scores


class KeywordExtractorTests(unittest.TestCase):
    @classmethod
    def setUpClass(cls) -> None:
        tag_map = {
            "fiery": "ADJ",
            "dragon": "NOUN",
            "attack": "VERB",
            "explosive": "ADJ",
            "flames": "NOUN",
            "burning": "ADJ",
            "creature": "NOUN",
            "with": "ADP",
            "blaze": "NOUN",
            "and": "CCONJ",
            "dangerous": "ADJ",
            "electric": "ADJ",
            "mouse": "NOUN",
            "using": "VERB",
            "thunder": "NOUN",
            "shock": "NOUN",
            "strong": "ADJ",
            "furret": "NOUN",
            "long": "ADJ",
            "slender": "ADJ",
            "soft": "ADJ",
            "fur": "NOUN",
            "flexible": "ADJ",
            "body": "NOUN",
            "move": "VERB",
            "gracefully": "ADJ",
            "narrow": "ADJ",
            "tunnel": "NOUN",
            "tail": "NOUN",
            "smash": "VERB",
            "opponent": "NOUN",
            "battle": "NOUN",
            "cheerful": "ADJ",
            "endurance": "NOUN",
        }

        stopwords = {
            "a",
            "very",
            "and",
            "with",
            "the",
            "it",
            "to",
            "its",
            "that",
            "through",
            "in",
        }
        cls.nlp = FakeNLP(tag_map=tag_map, stopwords=stopwords)
        cls.extractor = KeywordExtractor(nlp=cls.nlp, use_yake=False)

    def test_readme_main_example(self) -> None:
        text = "fiery dragon attack explosive flames"
        result = self.extractor.extract(text)
        self.assertEqual(result, ["fire", "dragon", "attack", "explosion"])

    def test_synonym_normalization(self) -> None:
        text = "burning creature with blaze power"
        result = self.extractor.extract(text)
        self.assertEqual(result, ["fire", "creature", "power"])

    def test_mixed_types(self) -> None:
        text = "electric mouse using thunder shock"
        result = self.extractor.extract(text)
        self.assertEqual(result, ["electric", "mouse", "using"])

    def test_noise_input(self) -> None:
        text = "a very very strong and dangerous creature"
        result = self.extractor.extract(text)
        self.assertEqual(result, ["strong", "dangerous", "creature"])

    def test_yake_keeps_detailed_information(self) -> None:
        text = (
            "furret long slender creature soft fur flexible body move gracefully narrow tunnel "
            "tail smash opponent battle cheerful endurance"
        )

        yake_scores = {
            "furret": 0.00,
            "creature": 0.05,
            "tail": 0.08,
            "battle": 0.10,
            "smash": 0.12,
            "tunnel": 0.14,
            "endurance": 0.18,
            "body": 0.20,
            "cheerful": 0.22,
            "slender": 0.26,
            "flexible": 0.28,
            "gracefully": 0.34,
            "narrow": 0.40,
            "long": 0.42,
            "soft": 0.44,
            "fur": 0.45,
            "move": 0.48,
            "opponent": 0.52,
        }
        extractor = TestableKeywordExtractor(
            nlp=self.nlp,
            use_yake=True,
            keep_ratio=0.8,
            min_keywords=10,
            max_keywords=30,
            yake_scores=yake_scores,
        )

        result = extractor.extract(text)

        self.assertGreaterEqual(len(result), 10)
        self.assertIn("furret", result)
        self.assertIn("creature", result)
        self.assertIn("tail", result)
        self.assertIn("tunnel", result)


if __name__ == "__main__":
    unittest.main()