first commit

clean-text-to-keywords/.ipynb_checkpoints/README-checkpoint.md

# Pokemon Text-to-JSON Pipeline

This project converts free-form Pokemon description text into:

1. A normalized keyword list
2. A populated Pokemon JSON object (from a blank/key-only template)

The pipeline is deterministic and rule-based.

## Architecture

### Stage 1: Keyword Extraction

File: `keyword_extractor.py`

Input: raw text description

Core logic:

- spaCy tokenization and POS tagging
- POS filtering (`NOUN`, `ADJ`, `VERB`)
- stopword and punctuation removal
- lemma-based normalization
- domain synonym normalization (example: `flames -> fire`)
- optional YAKE relevance scoring
- conservative retention policy so detail is not over-pruned

Output: ordered list of normalized keywords
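
As a runnable sketch of the normalization and deduplication steps (independent of spaCy; `SYNONYMS` and `normalize` below are toy stand-ins, not the real map or API in `keyword_extractor.py`):

```python
# Toy sketch of the normalization stage. SYNONYMS is a small stand-in
# for the full domain map in keyword_extractor.py.
SYNONYMS = {"flame": "fire", "flames": "fire", "blaze": "fire"}

def normalize(tokens):
    # Map each token to its canonical concept, then deduplicate
    # while preserving first-seen order.
    seen, out = set(), []
    for tok in tokens:
        canon = SYNONYMS.get(tok.lower(), tok.lower())
        if canon not in seen:
            seen.add(canon)
            out.append(canon)
    return out

print(normalize(["flames", "dragon", "blaze"]))  # -> ['fire', 'dragon']
```

The real pipeline applies the same idea after POS filtering and lemmatization.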

### Stage 2: JSON Inference

File: `json_inference.py`

Input: keyword list + optional JSON template

Core logic:

- infer primary/secondary type
- infer name candidate
- infer attacks, abilities, habitat, personality
- infer basic stats (`hp`, `attack`, `defense`, `speed`)
- fill nested TCG-like template fields (`types`, `attacks`, `weaknesses`, `stage`, `retreat`, etc.)
- preserve fields that are already non-empty in the provided template

Output: inferred JSON profile
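
Type inference itself is not shown in this commit, but a minimal sketch of the keyword-scoring idea might look like this (`TYPE_HINTS` and `infer_primary_type` are illustrative names, not the actual API of `json_inference.py`):

```python
# Illustrative sketch of primary-type inference: score each type by how
# many of its indicator keywords appear, then pick the best match.
# TYPE_HINTS is a toy stand-in, not the real table in json_inference.py.
TYPE_HINTS = {
    "fire": {"fire", "burn", "heat"},
    "water": {"water", "wave", "splash"},
    "normal": {"normal", "agile", "fur"},
}

def infer_primary_type(keywords):
    scores = {t: len(hints & set(keywords)) for t, hints in TYPE_HINTS.items()}
    best = max(scores, key=scores.get)
    return best if scores[best] > 0 else "normal"  # no evidence: fall back

print(infer_primary_type(["furret", "agile", "fur"]))  # -> 'normal'
```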

### Stage 3: Orchestration CLI

File: `infer_json_usage.py`

This is the main entrypoint for end-to-end usage.

Default behavior:

1. prints the extracted keyword list
2. prints the inferred JSON
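
The end-to-end flow can be sketched as follows; `extract_keywords` and `infer_json` here are trivial stand-ins so the wiring is runnable in isolation, not the real stage implementations (the real CLI also handles `--template`, `--keywords`, and `--json-only`):

```python
import json

# Hypothetical wiring of the two stages. The stand-in functions below
# only mimic the shape of the real keyword_extractor / json_inference
# modules so the orchestration flow can run on its own.
def extract_keywords(text):
    return text.lower().split()

def infer_json(keywords):
    name = keywords[0].capitalize() if keywords else ""
    return {"name": name, "keywords": keywords}

def run(text, json_only=False):
    keywords = extract_keywords(text)
    if not json_only:
        print(json.dumps(keywords))            # 1. keyword list
    print(json.dumps(infer_json(keywords)))    # 2. inferred JSON

run("furret agile")
```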

## Project Structure

- `keyword_extractor.py`: keyword extraction engine
- `json_inference.py`: keyword-to-JSON inference logic
- `infer_json_usage.py`: end-to-end CLI
- `example_usage.py`: keyword-extraction-only CLI
- `json_template_example.json`: sample blank/key-only template
- `test_keyword_extractor.py`: extraction tests
- `test_json_inference.py`: inference tests
- `requirements.txt`: Python dependencies

## Requirements

- Python 3.13 or lower is recommended for spaCy compatibility
- pip

Dependencies in `requirements.txt`:

- `spacy>=3.7.0`
- `yake>=0.4.2`

## Setup

1. Create and activate a virtual environment (recommended)

```bash
python -m venv .venv
source .venv/bin/activate
```

2. Install dependencies

```bash
pip install -r requirements.txt
```

3. Install the spaCy English model

```bash
python -m spacy download en_core_web_sm
```

## How To Run

### A) Extract keywords only

```bash
python example_usage.py "furret long slender agile creature with soft fur"
```

Output: a JSON list of keywords.

### B) End-to-end: text -> keywords -> JSON

```bash
python infer_json_usage.py --template json_template_example.json "furret long slender agile creature with soft fur"
```

Output order:

1. keyword list
2. inferred JSON

### C) End-to-end, JSON output only

```bash
python infer_json_usage.py --json-only --template json_template_example.json "furret long slender agile creature with soft fur"
```

### D) Start from keywords directly

```bash
python infer_json_usage.py --template json_template_example.json --keywords furret normal tail smash tunnel agile cheerful explore endurance
```

Tip: if you pass `--keywords`, text extraction is skipped.

## Template Behavior

If `--template` is omitted, inference returns a full inferred profile object.

If `--template` is provided:

- empty fields are populated from inferred values
- non-empty fields are preserved

The current sample template supports nested card-like data, including:

- `types`
- `attacks` with `cost`, `name`, `effect`, `damage`
- `weaknesses` with `type`, `value`
- `stage`, `retreat`, `legal`
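
A sketch of this fill rule, assuming "empty" means `""`, `None`, `[]`, or `{}` (`fill_template` is an illustrative name; `json_inference.py` is the source of truth):

```python
# Sketch of the template-fill rule: empty fields take inferred values,
# non-empty fields are preserved, and nested dicts are filled recursively.
# Assumed semantics based on the README, not the real implementation.
def fill_template(template, inferred):
    filled = {}
    for key, value in template.items():
        if isinstance(value, dict):
            filled[key] = fill_template(value, inferred.get(key, {}))
        elif value in ("", None, [], {}):
            filled[key] = inferred.get(key, value)
        else:
            filled[key] = value  # already populated: keep as-is
    return filled

template = {"name": "", "types": [], "stage": "Basic"}
inferred = {"name": "Furret", "types": ["normal"], "stage": "Stage 1"}
print(fill_template(template, inferred))
# -> {'name': 'Furret', 'types': ['normal'], 'stage': 'Basic'}
```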

## Tests

Run all tests:

```bash
python -m unittest -q
```

## Troubleshooting

### 1) spaCy model not found

The error message says `en_core_web_sm` is not installed.

Fix:

```bash
python -m spacy download en_core_web_sm
```

### 2) spaCy import/runtime problems on very new Python versions

Use Python 3.13 or lower and reinstall the requirements.

### 3) `--template` path errors

Ensure `--template` points to a valid file path, for example:

```bash
--template json_template_example.json
```

If your input is already a keyword list, use `--keywords` instead of putting the list in `--template`.

## Design Notes

- deterministic and explainable (no LLM calls)
- domain mappings are easy to extend in `keyword_extractor.py` and `json_inference.py`
- scoring and template-fill rules are intentionally simple and stable for game-content generation
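
For example, adding a new canonical concept to the synonym map might look like this (the dict shown is a one-entry excerpt, and the commented constructor call assumes `KeywordExtractor.from_default_model` as defined in `keyword_extractor.py`):

```python
# Sketch of extending the domain mapping: copy the default map and add
# a new canonical concept. The dict below is a small excerpt, not the
# full DEFAULT_NORMALIZATION_MAP from keyword_extractor.py.
DEFAULT_NORMALIZATION_MAP = {
    "fire": ["flame", "flames", "blaze"],
}

custom_map = dict(DEFAULT_NORMALIZATION_MAP)
custom_map["cosmic"] = ["space", "star", "galaxy", "meteor"]

# The extractor would then be built with the extended map, e.g.:
# extractor = KeywordExtractor.from_default_model(normalization_map=custom_map)
print(sorted(custom_map))  # -> ['cosmic', 'fire']
```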

clean-text-to-keywords/example_usage.py

import argparse
import json
from typing import Sequence

from keyword_extractor import KeywordExtractor


def _build_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(
        description="Extract normalized keywords from cleaned text.",
    )
    parser.add_argument(
        "text",
        nargs="+",
        help="Input text to process. Pass as one quoted string or multiple words.",
    )
    parser.add_argument(
        "--model",
        default="en_core_web_sm",
        help="spaCy model name (default: en_core_web_sm).",
    )
    return parser


def main(argv: Sequence[str] | None = None) -> None:
    parser = _build_parser()
    args = parser.parse_args(argv)

    text = " ".join(args.text)
    extractor = KeywordExtractor.from_default_model(model_name=args.model)
    keywords = extractor.extract(text)
    print(json.dumps(keywords))


if __name__ == "__main__":
    main()

clean-text-to-keywords/keyword_extractor.py
"""Rule-based keyword extraction and normalization for Pokemon card generation."""

from __future__ import annotations

from dataclasses import dataclass, field
from typing import Any, Dict, Iterable, List, Mapping, Optional, Sequence, Set, Tuple

# Canonical concept -> synonym list
DEFAULT_NORMALIZATION_MAP: Dict[str, List[str]] = {
    "normal": ["basic", "common", "regular", "plain"],
    "fire": ["flame", "flames", "burn", "burning", "blaze", "fiery", "heat", "inferno"],
    "water": ["wave", "ocean", "sea", "river", "aqua", "splash", "tidal"],
    "grass": ["plant", "leaf", "forest", "nature", "vine", "seed", "flora"],
    "flying": ["air", "wind", "sky", "wing", "wings", "flight", "soar"],
    "fighting": ["punch", "kick", "strike", "martial", "combat", "brawl"],
    "poison": ["toxic", "venom", "acid", "poisonous", "toxin"],
    "electric": ["lightning", "thunder", "shock", "volt", "spark", "electricity"],
    "ground": ["earth", "soil", "sand", "mud", "quake", "dust"],
    "rock": ["stone", "boulder", "crystal", "rocky", "pebble"],
    "psychic": ["mind", "mental", "telepathy", "psionic", "brain", "illusion"],
    "ice": ["freeze", "frozen", "snow", "frost", "blizzard", "icy"],
    "bug": ["insect", "ant", "beetle", "spider", "crawler"],
    "ghost": ["spirit", "phantom", "haunt", "shadow", "specter"],
    "steel": ["metal", "iron", "armor", "blade", "alloy"],
    "dragon": ["drake", "wyrm", "serpent", "legendary"],
    "dark": ["shadow", "evil", "night", "doom", "darkness"],
    "fairy": ["magic", "magical", "sparkle", "light", "charm"],
    "explosion": ["explosive", "blast", "boom", "detonate"],
}

DEFAULT_ALLOWED_POS: Tuple[str, ...] = ("NOUN", "ADJ", "VERB")


def _invert_normalization_map(normalization_map: Mapping[str, Iterable[str]]) -> Dict[str, str]:
    """Build synonym -> canonical mapping for O(1) normalization lookup."""
    inverse: Dict[str, str] = {}
    for canonical, synonyms in normalization_map.items():
        canonical_normalized = canonical.strip().lower()
        inverse[canonical_normalized] = canonical_normalized
        for synonym in synonyms:
            synonym_normalized = synonym.strip().lower()
            if synonym_normalized:
                inverse[synonym_normalized] = canonical_normalized
    return inverse


def _deduplicate_preserve_order(items: Iterable[str]) -> List[str]:
    seen: Set[str] = set()
    output: List[str] = []
    for item in items:
        if item not in seen:
            seen.add(item)
            output.append(item)
    return output


@dataclass
class KeywordExtractor:
    """Deterministic spaCy + rule-based keyword extraction pipeline."""

    nlp: Any
    normalization_map: Mapping[str, Iterable[str]] = field(default_factory=lambda: DEFAULT_NORMALIZATION_MAP)
    allowed_pos: Sequence[str] = field(default_factory=lambda: DEFAULT_ALLOWED_POS)

    def __post_init__(self) -> None:
        self._normalization_lookup = _invert_normalization_map(self.normalization_map)
        self._allowed_pos_set = set(self.allowed_pos)

    @classmethod
    def from_default_model(
        cls,
        model_name: str = "en_core_web_sm",
        normalization_map: Optional[Mapping[str, Iterable[str]]] = None,
        allowed_pos: Sequence[str] = DEFAULT_ALLOWED_POS,
    ) -> "KeywordExtractor":
        """Initialize extractor with a spaCy English pipeline."""
        try:
            import spacy

            nlp = spacy.load(model_name)
        except OSError as exc:
            raise OSError(
                f"spaCy model '{model_name}' is not installed. "
                "Run: python -m spacy download en_core_web_sm"
            ) from exc
        except Exception as exc:
            raise RuntimeError(
                "spaCy could not be loaded in this Python environment. "
                "Try Python 3.13 or lower, then install spaCy and en_core_web_sm."
            ) from exc

        return cls(
            nlp=nlp,
            normalization_map=normalization_map or DEFAULT_NORMALIZATION_MAP,
            allowed_pos=allowed_pos,
        )
    def extract(self, text: str) -> List[str]:
        """Extract and normalize keywords from already-cleaned text."""
        if not text or not text.strip():
            return []

        doc = self.nlp(text)

        # Step 1: POS filtering + base normalization to lowercase lemmas/tokens.
        raw_keywords: List[str] = []
        for token in doc:
            if token.is_stop or token.is_punct or token.pos_ not in self._allowed_pos_set:
                continue

            # Use lemma where possible to collapse inflections.
            base = token.lemma_.lower().strip() if token.lemma_ and token.lemma_ != "-PRON-" else token.text.lower().strip()
            if base:
                raw_keywords.append(base)

        # Step 2: Deduplicate before domain normalization (as described in the README).
        deduplicated = _deduplicate_preserve_order(raw_keywords)

        # Step 3: Map variants/synonyms to canonical concepts.
        normalized = [self._normalize_keyword(keyword) for keyword in deduplicated]

        # Step 4: Deduplicate again, since multiple words can map to one concept.
        return _deduplicate_preserve_order(normalized)

    def _normalize_keyword(self, keyword: str) -> str:
        keyword_lower = keyword.lower()
        return self._normalization_lookup.get(keyword_lower, keyword_lower)


def extract_keywords(
    text: str,
    extractor: Optional[KeywordExtractor] = None,
) -> List[str]:
    """Convenience API to extract keywords with default extractor config."""
    active_extractor = extractor or KeywordExtractor.from_default_model()
    return active_extractor.extract(text)

clean-text-to-keywords/test_keyword_extractor.py

import unittest

from keyword_extractor import KeywordExtractor


class FakeToken:
    def __init__(self, text: str, pos: str, lemma: str, is_stop: bool) -> None:
        self.text = text
        self.pos_ = pos
        self.lemma_ = lemma
        self.is_stop = is_stop
        self.is_punct = not any(ch.isalnum() for ch in text)


class FakeNLP:
    def __init__(self, tag_map, stopwords) -> None:
        self.tag_map = tag_map
        self.stopwords = stopwords

    def __call__(self, text: str):
        tokens = []
        for raw in text.split():
            token_text = raw.strip()
            lowered = token_text.lower()
            tokens.append(
                FakeToken(
                    text=token_text,
                    pos=self.tag_map.get(lowered, "NOUN"),
                    lemma=lowered,
                    is_stop=lowered in self.stopwords,
                )
            )
        return tokens


class KeywordExtractorTests(unittest.TestCase):
    @classmethod
    def setUpClass(cls) -> None:
        tag_map = {
            "fiery": "ADJ",
            "dragon": "NOUN",
            "attack": "VERB",
            "explosive": "ADJ",
            "flames": "NOUN",
            "burning": "ADJ",
            "creature": "NOUN",
            "with": "ADP",
            "blaze": "NOUN",
            "power": "NOUN",
            "electric": "ADJ",
            "mouse": "NOUN",
            "using": "VERB",
            "thunder": "NOUN",
            "shock": "NOUN",
            "a": "DET",
            "very": "ADV",
            "strong": "ADJ",
            "and": "CCONJ",
            "dangerous": "ADJ",
        }

        stopwords = {"a", "very", "and", "with"}
        cls.nlp = FakeNLP(tag_map=tag_map, stopwords=stopwords)
        cls.extractor = KeywordExtractor(nlp=cls.nlp)

    def test_readme_main_example(self) -> None:
        text = "fiery dragon attack explosive flames"
        result = self.extractor.extract(text)
        self.assertEqual(result, ["fire", "dragon", "attack", "explosion"])

    def test_synonym_normalization(self) -> None:
        text = "burning creature with blaze power"
        result = self.extractor.extract(text)
        self.assertEqual(result, ["fire", "creature", "power"])

    def test_mixed_types(self) -> None:
        text = "electric mouse using thunder shock"
        result = self.extractor.extract(text)
        self.assertEqual(result, ["electric", "mouse", "using"])

    def test_noise_input(self) -> None:
        text = "a very very strong and dangerous creature"
        result = self.extractor.extract(text)
        self.assertEqual(result, ["strong", "dangerous", "creature"])


if __name__ == "__main__":
    unittest.main()