first commit

This commit is contained in:
2026-03-19 18:16:20 +01:00
commit 584b2e07b4
34 changed files with 4381 additions and 0 deletions

View File

@@ -0,0 +1,189 @@
# Pokemon Text-to-JSON Pipeline
This project converts free-form Pokemon description text into:
1. A normalized keyword list
2. A populated Pokemon JSON object (from a blank/key-only template)
The pipeline is deterministic and rule-based.
## Architecture
### Stage 1: Keyword Extraction
File: `keyword_extractor.py`
Input: raw text description
Core logic:
- spaCy tokenization and POS tagging
- POS filtering (`NOUN`, `ADJ`, `VERB`)
- stopword and punctuation removal
- lemma-based normalization
- domain synonym normalization (example: `flames -> fire`)
- optional YAKE relevance scoring
- conservative retention policy so detail is not over-pruned
Output: ordered list of normalized keywords
### Stage 2: JSON Inference
File: `json_inference.py`
Input: keyword list + optional JSON template
Core logic:
- infer primary/secondary type
- infer name candidate
- infer attacks, abilities, habitat, personality
- infer basic stats (`hp`, `attack`, `defense`, `speed`)
- fill nested TCG-like template fields (`types`, `attacks`, `weaknesses`, `stage`, `retreat`, etc.)
- preserve already non-empty values in the provided template
Output: inferred JSON profile
### Stage 3: Orchestration CLI
File: `infer_json_usage.py`
This is the main entrypoint for end-to-end usage.
Default behavior:
1. prints extracted keyword list
2. prints inferred JSON
## Project Structure
- `keyword_extractor.py`: keyword extraction engine
- `json_inference.py`: keyword-to-JSON inference logic
- `infer_json_usage.py`: end-to-end CLI
- `example_usage.py`: keyword-extraction-only CLI
- `json_template_example.json`: sample blank/key-only template
- `test_keyword_extractor.py`: extraction tests
- `test_json_inference.py`: inference tests
- `requirements.txt`: Python dependencies
## Requirements
- Python 3.13 or lower is recommended for spaCy compatibility
- pip
Dependencies in `requirements.txt`:
- `spacy>=3.7.0`
- `yake>=0.4.2`
## Setup
1. Create and activate a virtual environment (recommended)
```bash
python -m venv .venv
source .venv/bin/activate
```
2. Install dependencies
```bash
pip install -r requirements.txt
```
3. Install spaCy English model
```bash
python -m spacy download en_core_web_sm
```
## How To Run
### A) Extract keywords only
```bash
python example_usage.py "furret long slender agile creature with soft fur"
```
Output: JSON list of keywords.
### B) End-to-end: text -> keywords -> JSON
```bash
python infer_json_usage.py --template json_template_example.json "furret long slender agile creature with soft fur"
```
Output order:
1. keyword list
2. inferred JSON
### C) End-to-end but JSON only
```bash
python infer_json_usage.py --json-only --template json_template_example.json "furret long slender agile creature with soft fur"
```
### D) Start from keywords directly
```bash
python infer_json_usage.py --template json_template_example.json --keywords furret normal tail smash tunnel agile cheerful explore endurance
```
Tip: If you pass `--keywords`, text extraction is skipped.
## Template Behavior
If `--template` is omitted, inference returns a full inferred profile object.
If `--template` is provided:
- empty fields are populated from inferred values
- non-empty fields are preserved
Current sample template supports nested card-like data including:
- `types`
- `attacks` with `cost`, `name`, `effect`, `damage`
- `weaknesses` with `type`, `value`
- `stage`, `retreat`, `legal`
## Tests
Run all tests:
```bash
python -m unittest -q
```
## Troubleshooting
### 1) spaCy model not found
Error mentions `en_core_web_sm` not installed.
Fix:
```bash
python -m spacy download en_core_web_sm
```
### 2) spaCy import/runtime problems on very new Python versions
Use Python 3.13 or lower and reinstall requirements.
### 3) `--template` path errors
Ensure `--template` points to a valid file path, for example:
```bash
--template json_template_example.json
```
If your input is already a keyword list, use `--keywords` instead of putting the list in `--template`.
## Design Notes
- deterministic and explainable (no LLM calls)
- domain mappings are easy to extend in `keyword_extractor.py` and `json_inference.py`
- scoring and template fill rules are intentionally simple and stable for game-content generation

View File

@@ -0,0 +1,36 @@
import argparse
import json
from typing import Sequence
from keyword_extractor import KeywordExtractor
def _build_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(
description="Extract normalized keywords from cleaned text.",
)
parser.add_argument(
"text",
nargs="+",
help="Input text to process. Pass as one quoted string or multiple words.",
)
parser.add_argument(
"--model",
default="en_core_web_sm",
help="spaCy model name (default: en_core_web_sm).",
)
return parser
def main(argv: Sequence[str] | None = None) -> None:
    """CLI entry point: parse args, extract keywords, print them as a JSON list."""
    args = _build_parser().parse_args(argv)
    joined_text = " ".join(args.text)
    extractor = KeywordExtractor.from_default_model(model_name=args.model)
    print(json.dumps(extractor.extract(joined_text)))


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,137 @@
"""Rule-based keyword extraction and normalization for Pokemon card generation."""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any, Dict, Iterable, List, Mapping, Optional, Sequence, Set, Tuple
# Canonical concept -> synonym list
from typing import Dict, List
# Canonical concept -> synonyms that should collapse onto it.
DEFAULT_NORMALIZATION_MAP: Dict[str, List[str]] = {
    "normal": ["basic", "common", "regular", "plain"],
    "fire": ["flame", "flames", "burn", "burning", "blaze", "fiery", "heat", "inferno"],
    "water": ["wave", "ocean", "sea", "river", "aqua", "splash", "tidal"],
    "grass": ["plant", "leaf", "forest", "nature", "vine", "seed", "flora"],
    "flying": ["air", "wind", "sky", "wing", "wings", "flight", "soar"],
    "fighting": ["punch", "kick", "strike", "martial", "combat", "brawl"],
    "poison": ["toxic", "venom", "acid", "poisonous", "toxin"],
    "electric": ["lightning", "thunder", "shock", "volt", "spark", "electricity"],
    "ground": ["earth", "soil", "sand", "mud", "quake", "dust"],
    "rock": ["stone", "boulder", "crystal", "rocky", "pebble"],
    # "psyonic" is a misspelling kept for backward compatibility;
    # "psionic" is the correct spelling and is matched as well.
    "psychic": ["mind", "mental", "telepathy", "psyonic", "psionic", "brain", "illusion"],
    "ice": ["freeze", "frozen", "snow", "frost", "blizzard", "icy"],
    "bug": ["insect", "ant", "beetle", "spider", "crawler"],
    # NOTE(review): "shadow" appears under both "ghost" and "dark"; when the
    # map is inverted for lookup, the later entry ("dark") silently wins for
    # that synonym — confirm this is intended.
    "ghost": ["spirit", "phantom", "haunt", "shadow", "specter"],
    "steel": ["metal", "iron", "armor", "blade", "alloy"],
    "dragon": ["drake", "wyrm", "serpent", "legendary"],
    "dark": ["shadow", "evil", "night", "doom", "darkness"],
    "fairy": ["magic", "magical", "sparkle", "light", "charm"],
}

# Part-of-speech tags retained during extraction.
DEFAULT_ALLOWED_POS: Tuple[str, ...] = ("NOUN", "ADJ", "VERB")
def _invert_normalization_map(normalization_map: Mapping[str, Iterable[str]]) -> Dict[str, str]:
"""Build synonym -> canonical mapping for O(1) normalization lookup."""
inverse: Dict[str, str] = {}
for canonical, synonyms in normalization_map.items():
canonical_normalized = canonical.strip().lower()
inverse[canonical_normalized] = canonical_normalized
for synonym in synonyms:
synonym_normalized = synonym.strip().lower()
if synonym_normalized:
inverse[synonym_normalized] = canonical_normalized
return inverse
def _deduplicate_preserve_order(items: Iterable[str]) -> List[str]:
seen: Set[str] = set()
output: List[str] = []
for item in items:
if item not in seen:
seen.add(item)
output.append(item)
return output
@dataclass
class KeywordExtractor:
    """Deterministic spaCy + rule-based keyword extraction pipeline."""

    # Loaded spaCy pipeline (any callable yielding token-like objects works).
    nlp: Any
    normalization_map: Mapping[str, Iterable[str]] = field(default_factory=lambda: DEFAULT_NORMALIZATION_MAP)
    allowed_pos: Sequence[str] = field(default_factory=lambda: DEFAULT_ALLOWED_POS)

    def __post_init__(self) -> None:
        # Precompute lookup structures once so extract() is O(1) per token.
        self._normalization_lookup = _invert_normalization_map(self.normalization_map)
        self._allowed_pos_set = set(self.allowed_pos)

    @classmethod
    def from_default_model(
        cls,
        model_name: str = "en_core_web_sm",
        normalization_map: Optional[Mapping[str, Iterable[str]]] = None,
        allowed_pos: Sequence[str] = DEFAULT_ALLOWED_POS,
    ) -> "KeywordExtractor":
        """Initialize extractor with a spaCy English pipeline."""
        try:
            import spacy

            nlp = spacy.load(model_name)
        except OSError as exc:
            # Model package missing: point the user at the download command.
            raise OSError(
                f"spaCy model '{model_name}' is not installed. "
                "Run: python -m spacy download en_core_web_sm"
            ) from exc
        except Exception as exc:
            raise RuntimeError(
                "spaCy could not be loaded in this Python environment. "
                "Try Python 3.13 or lower, then install spaCy and en_core_web_sm."
            ) from exc
        return cls(
            nlp=nlp,
            normalization_map=normalization_map or DEFAULT_NORMALIZATION_MAP,
            allowed_pos=allowed_pos,
        )

    def extract(self, text: str) -> List[str]:
        """Extract and normalize keywords from already-cleaned text."""
        if not text or not text.strip():
            return []
        # Pass 1: POS filtering plus lowercase lemma/token normalization.
        candidates: List[str] = []
        for token in self.nlp(text):
            if token.is_stop or token.is_punct:
                continue
            if token.pos_ not in self._allowed_pos_set:
                continue
            # Prefer the lemma to collapse inflections; "-PRON-" is the
            # legacy spaCy pronoun placeholder and is not a usable lemma.
            lemma = token.lemma_
            base = lemma if lemma and lemma != "-PRON-" else token.text
            base = base.lower().strip()
            if base:
                candidates.append(base)
        # Pass 2: deduplicate before mapping synonyms onto canonical concepts.
        unique = _deduplicate_preserve_order(candidates)
        canonical = [self._normalize_keyword(word) for word in unique]
        # Pass 3: deduplicate again, since several words may share a concept.
        return _deduplicate_preserve_order(canonical)

    def _normalize_keyword(self, keyword: str) -> str:
        # Unknown keywords pass through unchanged (lowercased).
        lowered = keyword.lower()
        return self._normalization_lookup.get(lowered, lowered)
def extract_keywords(
    text: str,
    extractor: Optional[KeywordExtractor] = None,
) -> List[str]:
    """Convenience API to extract keywords with default extractor config."""
    if extractor is None:
        extractor = KeywordExtractor.from_default_model()
    return extractor.extract(text)

View File

@@ -0,0 +1,88 @@
import unittest
from keyword_extractor import KeywordExtractor
class FakeToken:
    """Minimal stand-in for a spaCy token used by the fake pipeline."""

    def __init__(self, text: str, pos: str, lemma: str, is_stop: bool) -> None:
        self.text = text
        self.pos_ = pos
        self.lemma_ = lemma
        self.is_stop = is_stop
        # A token counts as punctuation when it has no alphanumeric character.
        self.is_punct = all(not ch.isalnum() for ch in text)
class FakeNLP:
    """Callable that mimics a spaCy pipeline using a static POS tag map."""

    def __init__(self, tag_map, stopwords) -> None:
        self.tag_map = tag_map
        self.stopwords = stopwords

    def __call__(self, text: str):
        result = []
        for word in text.split():
            stripped = word.strip()
            key = stripped.lower()
            # Unknown words default to NOUN; the lemma is just the lowercase form.
            result.append(
                FakeToken(
                    text=stripped,
                    pos=self.tag_map.get(key, "NOUN"),
                    lemma=key,
                    is_stop=key in self.stopwords,
                )
            )
        return result
class KeywordExtractorTests(unittest.TestCase):
    """Behavioral tests for KeywordExtractor driven by a deterministic fake pipeline."""

    @classmethod
    def setUpClass(cls) -> None:
        # Static POS assignments; FakeNLP defaults any word absent from this
        # map to NOUN, and uses the lowercased word itself as the lemma.
        tag_map = {
            "fiery": "ADJ",
            "dragon": "NOUN",
            "attack": "VERB",
            "explosive": "ADJ",
            "flames": "NOUN",
            "burning": "ADJ",
            "creature": "NOUN",
            "with": "ADP",
            "blaze": "NOUN",
            "power": "NOUN",
            "electric": "ADJ",
            "mouse": "NOUN",
            "using": "VERB",
            "thunder": "NOUN",
            "shock": "NOUN",
            "a": "DET",
            "very": "ADV",
            "strong": "ADJ",
            "and": "CCONJ",
            "dangerous": "ADJ",
        }
        stopwords = {"a", "very", "and", "with"}
        cls.nlp = FakeNLP(tag_map=tag_map, stopwords=stopwords)
        # Extractor uses the default normalization map and allowed-POS set.
        cls.extractor = KeywordExtractor(nlp=cls.nlp)

    def test_readme_main_example(self) -> None:
        # "fiery" and "flames" both normalize to "fire" (deduplicated);
        # "explosive" -> "explosion".
        # NOTE(review): this expects an "explosion" entry in the normalization
        # map — confirm the map version under test includes it.
        text = "fiery dragon attack explosive flames"
        result = self.extractor.extract(text)
        self.assertEqual(result, ["fire", "dragon", "attack", "explosion"])

    def test_synonym_normalization(self) -> None:
        # "burning" and "blaze" collapse onto "fire"; "with" is a stopword.
        text = "burning creature with blaze power"
        result = self.extractor.extract(text)
        self.assertEqual(result, ["fire", "creature", "power"])

    def test_mixed_types(self) -> None:
        # "thunder" and "shock" both normalize to "electric", which is already
        # present, so only one "electric" survives deduplication.
        text = "electric mouse using thunder shock"
        result = self.extractor.extract(text)
        self.assertEqual(result, ["electric", "mouse", "using"])

    def test_noise_input(self) -> None:
        # Stopwords ("a", "very", "and") and the ADV tag are filtered out.
        text = "a very very strong and dangerous creature"
        result = self.extractor.extract(text)
        self.assertEqual(result, ["strong", "dangerous", "creature"])


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,189 @@
# Pokemon Text-to-JSON Pipeline
This project converts free-form Pokemon description text into:
1. A normalized keyword list
2. A populated Pokemon JSON object (from a blank/key-only template)
The pipeline is deterministic and rule-based.
## Architecture
### Stage 1: Keyword Extraction
File: `keyword_extractor.py`
Input: raw text description
Core logic:
- spaCy tokenization and POS tagging
- POS filtering (`NOUN`, `ADJ`, `VERB`)
- stopword and punctuation removal
- lemma-based normalization
- domain synonym normalization (example: `flames -> fire`)
- optional YAKE relevance scoring
- conservative retention policy so detail is not over-pruned
Output: ordered list of normalized keywords
### Stage 2: JSON Inference
File: `json_inference.py`
Input: keyword list + optional JSON template
Core logic:
- infer primary/secondary type
- infer name candidate
- infer attacks, abilities, habitat, personality
- infer basic stats (`hp`, `attack`, `defense`, `speed`)
- fill nested TCG-like template fields (`types`, `attacks`, `weaknesses`, `stage`, `retreat`, etc.)
- preserve already non-empty values in the provided template
Output: inferred JSON profile
### Stage 3: Orchestration CLI
File: `infer_json_usage.py`
This is the main entrypoint for end-to-end usage.
Default behavior:
1. prints extracted keyword list
2. prints inferred JSON
## Project Structure
- `keyword_extractor.py`: keyword extraction engine
- `json_inference.py`: keyword-to-JSON inference logic
- `infer_json_usage.py`: end-to-end CLI
- `example_usage.py`: keyword-extraction-only CLI
- `json_template_example.json`: sample blank/key-only template
- `test_keyword_extractor.py`: extraction tests
- `test_json_inference.py`: inference tests
- `requirements.txt`: Python dependencies
## Requirements
- Python 3.13 or lower is recommended for spaCy compatibility
- pip
Dependencies in `requirements.txt`:
- `spacy>=3.7.0`
- `yake>=0.4.2`
## Setup
1. Create and activate a virtual environment (recommended)
```bash
python -m venv .venv
source .venv/bin/activate
```
2. Install dependencies
```bash
pip install -r requirements.txt
```
3. Install spaCy English model
```bash
python -m spacy download en_core_web_sm
```
## How To Run
### A) Extract keywords only
```bash
python example_usage.py "furret long slender agile creature with soft fur"
```
Output: JSON list of keywords.
### B) End-to-end: text -> keywords -> JSON
```bash
python infer_json_usage.py --template json_template_example.json "furret long slender agile creature with soft fur"
```
Output order:
1. keyword list
2. inferred JSON
### C) End-to-end but JSON only
```bash
python infer_json_usage.py --json-only --template json_template_example.json "furret long slender agile creature with soft fur"
```
### D) Start from keywords directly
```bash
python infer_json_usage.py --template json_template_example.json --keywords furret normal tail smash tunnel agile cheerful explore endurance
```
Tip: If you pass `--keywords`, text extraction is skipped.
## Template Behavior
If `--template` is omitted, inference returns a full inferred profile object.
If `--template` is provided:
- empty fields are populated from inferred values
- non-empty fields are preserved
Current sample template supports nested card-like data including:
- `types`
- `attacks` with `cost`, `name`, `effect`, `damage`
- `weaknesses` with `type`, `value`
- `stage`, `retreat`, `legal`
## Tests
Run all tests:
```bash
python -m unittest -q
```
## Troubleshooting
### 1) spaCy model not found
Error mentions `en_core_web_sm` not installed.
Fix:
```bash
python -m spacy download en_core_web_sm
```
### 2) spaCy import/runtime problems on very new Python versions
Use Python 3.13 or lower and reinstall requirements.
### 3) `--template` path errors
Ensure `--template` points to a valid file path, for example:
```bash
--template json_template_example.json
```
If your input is already a keyword list, use `--keywords` instead of putting the list in `--template`.
## Design Notes
- deterministic and explainable (no LLM calls)
- domain mappings are easy to extend in `keyword_extractor.py` and `json_inference.py`
- scoring and template fill rules are intentionally simple and stable for game-content generation

View File

@@ -0,0 +1,36 @@
import argparse
import json
from typing import Sequence
from keyword_extractor import KeywordExtractor
def _build_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(
description="Extract normalized keywords from cleaned text.",
)
parser.add_argument(
"text",
nargs="+",
help="Input text to process. Pass as one quoted string or multiple words.",
)
parser.add_argument(
"--model",
default="en_core_web_sm",
help="spaCy model name (default: en_core_web_sm).",
)
return parser
def main(argv: Sequence[str] | None = None) -> None:
    """CLI entry point: parse args, extract keywords, print them as a JSON list."""
    args = _build_parser().parse_args(argv)
    joined_text = " ".join(args.text)
    extractor = KeywordExtractor.from_default_model(model_name=args.model)
    print(json.dumps(extractor.extract(joined_text)))


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,111 @@
import argparse
import json
import os
import re
from typing import Sequence
from keyword_extractor import KeywordExtractor
from json_inference import fill_template_from_keywords
def _build_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(
description="Extract keywords and infer values into a JSON template.",
)
parser.add_argument(
"text",
nargs="*",
help="Input description text.",
)
parser.add_argument(
"--template",
default="",
help="Path to JSON template file with keys only. If omitted, full inferred JSON is returned.",
)
parser.add_argument(
"--model",
default="en_core_web_sm",
help="spaCy model name (default: en_core_web_sm).",
)
parser.add_argument(
"--keywords",
nargs="+",
default=None,
help="Provide keywords directly instead of raw text.",
)
parser.add_argument(
"--json-only",
action="store_true",
help="Print only inferred JSON (skip keyword list output).",
)
return parser
def _load_template(path: str):
if not path:
return {}
if not os.path.exists(path):
raise FileNotFoundError(f"Template file not found: {path}")
with open(path, "r", encoding="utf-8") as file_handle:
raw = file_handle.read().strip()
if not raw:
return {}
return json.loads(raw)
def _parse_keywords_fragment(raw: str):
if not raw.strip():
return []
try:
parsed = json.loads(raw)
if isinstance(parsed, list):
return [str(item).strip().lower() for item in parsed if str(item).strip()]
except json.JSONDecodeError:
pass
tokens = re.findall(r"[a-zA-Z0-9_-]+", raw.lower())
return [token for token in tokens if token]
def _extract_keywords(args):
if args.keywords:
return [word.strip().lower() for word in args.keywords if word.strip()]
if args.template and not os.path.exists(args.template) and args.template.lstrip().startswith("["):
raw = " ".join([args.template] + args.text)
return _parse_keywords_fragment(raw)
if not args.text:
raise ValueError("Provide input text or use --keywords.")
text = " ".join(args.text)
extractor = KeywordExtractor.from_default_model(model_name=args.model)
return extractor.extract(text)
def main(argv: Sequence[str] | None = None) -> None:
    """End-to-end CLI: text/keywords -> keyword list -> inferred JSON on stdout."""
    parser = _build_parser()
    args = parser.parse_args(argv)
    keywords = _extract_keywords(args)
    template_path = args.template
    # If --template actually holds a pasted keyword list (non-existent path
    # starting with "["), _extract_keywords already consumed it; clear the
    # path so no template file is loaded.
    if args.template and not os.path.exists(args.template) and args.template.lstrip().startswith("["):
        template_path = ""
    template = _load_template(template_path)
    inferred_json = fill_template_from_keywords(template, keywords)
    if args.json_only:
        # --json-only suppresses the keyword list line.
        print(json.dumps(inferred_json, indent=2))
        return
    # Default output order: keyword list first, then the inferred JSON.
    print(json.dumps(keywords))
    print(json.dumps(inferred_json, indent=2))


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,398 @@
"""Infer Pokemon-like JSON values from extracted keywords."""
from __future__ import annotations
from copy import deepcopy
from typing import Any, Dict, Iterable, List, Mapping, Sequence
# The 18 canonical Pokemon type names; keywords matching one of these are
# treated as type candidates by _pick_types.
POKEMON_TYPES = {
    "normal",
    "fire",
    "water",
    "grass",
    "electric",
    "ice",
    "fighting",
    "poison",
    "ground",
    "flying",
    "psychic",
    "bug",
    "rock",
    "ghost",
    "dragon",
    "dark",
    "steel",
    "fairy",
}

# Keywords treated as habitat candidates by _pick_habitat (first match wins).
HABITAT_KEYWORDS = {
    "forest",
    "field",
    "cave",
    "mountain",
    "river",
    "ocean",
    "sea",
    "tunnel",
    "nest",
    "sky",
    "desert",
    "swamp",
    "volcano",
}

# Keywords treated as personality traits; _pick_personality keeps up to three.
PERSONALITY_KEYWORDS = {
    "calm",
    "gentle",
    "agile",
    "playful",
    "cheerful",
    "energetic",
    "curious",
    "fierce",
    "brave",
    "loyal",
    "timid",
    "bold",
}

# Keywords treated as attack/move names; _pick_attacks keeps up to four.
MOVE_KEYWORDS = {
    "attack",
    "smash",
    "strike",
    "kick",
    "punch",
    "shock",
    "thunder",
    "bolt",
    "blast",
    "explosion",
    "freeze",
    "bite",
    "claw",
    "tail",
    "fight",
}

# Keywords treated as passive abilities; _pick_abilities keeps up to four.
ABILITY_KEYWORDS = {
    "recover",
    "endurance",
    "explore",
    "hide",
    "wander",
    "bond",
    "speed",
    "power",
    "energy",
    "flexible",
}

# Per-stat hint sets: each keyword match adds 10 points in _score_stat.
STAT_HINTS = {
    "hp": {"endurance", "recover", "energy", "stamina", "healthy", "vital"},
    "attack": {"attack", "smash", "strike", "punch", "kick", "claw", "fight", "power"},
    "defense": {"armor", "shield", "tough", "hard", "resist", "solid"},
    "speed": {"speed", "swift", "agile", "quick", "fast", "dash"},
}

# Template-key spellings that map onto one canonical inferred-profile key;
# _canonical_key scans these in dict insertion order.
KEY_ALIASES = {
    "name": {"name", "pokemon_name"},
    "type": {"type", "primary_type", "pokemon_type"},
    "secondary_type": {"secondary_type", "type2", "secondary"},
    "attacks": {"attacks", "moves", "skills", "offense"},
    "abilities": {"abilities", "traits", "passives", "special_abilities"},
    "habitat": {"habitat", "environment", "region"},
    "personality": {"personality", "temperament", "nature"},
    "description": {"description", "flavor_text", "summary", "lore"},
    "keywords": {"keywords", "tags"},
    "hp": {"hp", "health", "health_points"},
    "attack": {"attack", "atk"},
    "defense": {"defense", "def"},
    "speed": {"speed", "spd"},
}

# Words too generic (colors, body parts, vague adjectives) to serve as a
# Pokemon name; _pick_name skips these.
GENERIC_NAME_BLACKLIST = {
    "black",
    "white",
    "yellow",
    "red",
    "blue",
    "green",
    "purple",
    "orange",
    "pink",
    "gray",
    "grey",
    "brown",
    "fur",
    "body",
    "tail",
    "claw",
    "storm",
    "cloud",
    "enemy",
    "super",
    "scary",
    "giant",
    "speed",
}

# Single representative weakness per type, used to fill the card's
# "weaknesses" entries in _fill_tcg_like_template.
TYPE_WEAKNESS = {
    "normal": "fighting",
    "fire": "water",
    "water": "electric",
    "grass": "fire",
    "electric": "ground",
    "ice": "fire",
    "fighting": "psychic",
    "poison": "ground",
    "ground": "water",
    "flying": "electric",
    "psychic": "dark",
    "bug": "fire",
    "rock": "water",
    "ghost": "dark",
    "dragon": "fairy",
    "dark": "fighting",
    "steel": "fire",
    "fairy": "steel",
}
def _title_case(value: str) -> str:
return " ".join(part.capitalize() for part in value.split())
def _is_empty_value(value: Any) -> bool:
if value is None:
return True
if isinstance(value, str):
return value.strip() == ""
if isinstance(value, (list, dict, tuple, set)):
return len(value) == 0
return False
def _canonical_key(key: str) -> str:
    """Map a template key (or alias) onto its canonical inferred-profile key."""
    lowered = key.lower().strip()
    for canonical, aliases in KEY_ALIASES.items():
        if lowered in aliases:
            return canonical
    # Unknown keys pass through unchanged (lowercased/stripped).
    return lowered
def _pick_name(keywords: Sequence[str]) -> str:
    """Choose the first keyword that is not a known domain term as the name."""
    reserved = (
        POKEMON_TYPES,
        HABITAT_KEYWORDS,
        MOVE_KEYWORDS,
        ABILITY_KEYWORDS,
        PERSONALITY_KEYWORDS,
        GENERIC_NAME_BLACKLIST,
    )
    for keyword in keywords:
        if any(keyword in bucket for bucket in reserved):
            continue
        # Very short tokens are unlikely to be usable names.
        if len(keyword) < 4:
            continue
        return _title_case(keyword)
    return "Unknown"
def _pick_types(keywords: Sequence[str]) -> List[str]:
    """Collect up to two distinct Pokemon types, defaulting to ["normal"]."""
    found: List[str] = []
    for word in keywords:
        if word in POKEMON_TYPES and word not in found:
            found.append(word)
            if len(found) == 2:
                break
    return found or ["normal"]
def _pick_habitat(keywords: Sequence[str]) -> str:
    """Return the first habitat-like keyword, or "unknown" when none appear."""
    return next((word for word in keywords if word in HABITAT_KEYWORDS), "unknown")
def _pick_personality(keywords: Sequence[str]) -> List[str]:
    """Collect up to three distinct personality traits, in order of appearance."""
    traits: List[str] = []
    for word in keywords:
        if word in PERSONALITY_KEYWORDS and word not in traits:
            traits.append(word)
            if len(traits) == 3:
                break
    return traits
def _pick_attacks(keywords: Sequence[str]) -> List[str]:
    """Collect up to four distinct move keywords, in order of appearance."""
    moves: List[str] = []
    for word in keywords:
        if word in MOVE_KEYWORDS and word not in moves:
            moves.append(word)
            if len(moves) == 4:
                break
    return moves
def _pick_abilities(keywords: Sequence[str]) -> List[str]:
    """Collect up to four distinct ability keywords, in order of appearance."""
    found: List[str] = []
    for word in keywords:
        if word in ABILITY_KEYWORDS and word not in found:
            found.append(word)
            if len(found) == 4:
                break
    return found
def _score_stat(base: int, keywords: Sequence[str], hints: Iterable[str]) -> int:
hint_set = set(hints)
matches = sum(1 for keyword in keywords if keyword in hint_set)
# Each match adds 10 points; keep stats in [40, 160].
return max(40, min(160, base + (matches * 10)))
def _build_description(name: str, primary_type: str, attacks: Sequence[str], abilities: Sequence[str], habitat: str) -> str:
attack_text = ", ".join(attacks) if attacks else "basic combat"
ability_text = ", ".join(abilities) if abilities else "balanced adaptation"
return (
f"{name} is a {primary_type}-type Pokemon often found in {habitat}. "
f"It commonly uses {attack_text} and shows abilities like {ability_text}."
)
def _retreat_cost_from_speed(speed: int) -> int:
if speed >= 120:
return 0
if speed >= 90:
return 1
if speed >= 70:
return 2
return 3
def _attack_damage_from_attack_stat(attack_stat: int, index: int) -> int:
# Keep card damage in simple 10-step increments.
base = 30 + max(0, attack_stat - 70) // 2
adjusted = base + (index * 10)
return max(10, min(160, (adjusted // 10) * 10))
def _energy_name_for_type(pokemon_type: str) -> str:
if pokemon_type == "normal":
return "Colorless"
return _title_case(pokemon_type)
def _fill_tcg_like_template(output: Dict[str, Any], inferred: Mapping[str, Any]) -> None:
    """Fill TCG-card-shaped fields of *output* in place from *inferred*.

    Only keys already present in *output* are touched, and only when their
    current value is empty; non-empty values are always preserved.
    """
    if "name" in output and _is_empty_value(output.get("name")):
        output["name"] = inferred["name"]
    if "description" in output and _is_empty_value(output.get("description")):
        output["description"] = inferred["description"]
    if "hp" in output and _is_empty_value(output.get("hp")):
        hp_value = inferred["hp"]
        # Mirror the template's existing value type: a string slot (e.g. "")
        # receives a stringified hp, otherwise the raw int is stored.
        output["hp"] = str(hp_value) if isinstance(output.get("hp"), str) else hp_value
    if "types" in output and isinstance(output.get("types"), list):
        types_value = output["types"]
        # A list of only empty placeholders (e.g. [""]) counts as empty.
        if len(types_value) == 0 or all(_is_empty_value(item) for item in types_value):
            inferred_types = [inferred["type"]]
            if inferred.get("secondary_type"):
                inferred_types.append(inferred["secondary_type"])
            output["types"] = inferred_types
    if "stage" in output and _is_empty_value(output.get("stage")):
        output["stage"] = "Basic"
    # A retreat of 0 is treated as "unset" here, since the template default is 0.
    if "retreat" in output and (output.get("retreat") in (None, 0, "")):
        output["retreat"] = _retreat_cost_from_speed(int(inferred["speed"]))
    if "weaknesses" in output and isinstance(output.get("weaknesses"), list):
        weaknesses = output["weaknesses"]
        if weaknesses:
            # Only the first weakness entry is filled; lookup falls back to
            # "fighting" for unknown types.
            weakness_type = TYPE_WEAKNESS.get(inferred["type"], "fighting")
            first = weaknesses[0]
            if isinstance(first, dict):
                if _is_empty_value(first.get("type")):
                    first["type"] = weakness_type
                if _is_empty_value(first.get("value")):
                    first["value"] = "x2"
    if "attacks" in output and isinstance(output.get("attacks"), list):
        attack_entries = output["attacks"]
        inferred_attacks = inferred["attacks"]
        inferred_type = inferred["type"]
        for idx, attack_entry in enumerate(attack_entries):
            if not isinstance(attack_entry, dict):
                continue
            # When fewer attacks were inferred than template slots, pad with "tackle".
            attack_name = inferred_attacks[idx] if idx < len(inferred_attacks) else "tackle"
            attack_title = _title_case(attack_name)
            if _is_empty_value(attack_entry.get("name")):
                attack_entry["name"] = attack_title
            if _is_empty_value(attack_entry.get("effect")):
                attack_entry["effect"] = f"Deals damage with {attack_name}."
            # Damage 0 counts as unset (template default); later slots hit harder.
            if "damage" in attack_entry and (attack_entry.get("damage") in (None, 0, "")):
                attack_entry["damage"] = _attack_damage_from_attack_stat(int(inferred["attack"]), idx)
            if "cost" in attack_entry and isinstance(attack_entry.get("cost"), list):
                current_cost = attack_entry["cost"]
                if len(current_cost) == 0 or all(_is_empty_value(item) for item in current_cost):
                    attack_entry["cost"] = [_energy_name_for_type(inferred_type)]
def infer_profile_from_keywords(keywords: Sequence[str]) -> Dict[str, Any]:
    """Build a full inferred Pokemon profile dict from a keyword list.

    Keywords are lowercased/stripped first; blanks are dropped. All stats
    start from a base of 70 and are nudged by matching STAT_HINTS keywords.
    """
    cleaned = [k.strip().lower() for k in keywords if k and k.strip()]
    name = _pick_name(cleaned)
    types = _pick_types(cleaned)
    attacks = _pick_attacks(cleaned)
    abilities = _pick_abilities(cleaned)
    habitat = _pick_habitat(cleaned)
    personality = _pick_personality(cleaned)
    hp = _score_stat(70, cleaned, STAT_HINTS["hp"])
    attack = _score_stat(70, cleaned, STAT_HINTS["attack"])
    defense = _score_stat(70, cleaned, STAT_HINTS["defense"])
    speed = _score_stat(70, cleaned, STAT_HINTS["speed"])
    return {
        "name": name,
        "type": types[0],
        # Only present when two distinct types were found in the keywords.
        "secondary_type": types[1] if len(types) > 1 else None,
        "attacks": attacks,
        "abilities": abilities,
        "habitat": habitat,
        "personality": personality,
        "hp": hp,
        "attack": attack,
        "defense": defense,
        "speed": speed,
        # The cleaned keyword list is echoed back for traceability.
        "keywords": cleaned,
        "description": _build_description(name, types[0], attacks, abilities, habitat),
    }
def fill_template_from_keywords(template: Mapping[str, Any], keywords: Sequence[str]) -> Dict[str, Any]:
    """Fill a key-only template by inferring values from keywords.

    Existing non-empty values in template are preserved.
    """
    inferred = infer_profile_from_keywords(keywords)
    output: Dict[str, Any] = deepcopy(dict(template))
    if not output:
        # No template provided: return the full inferred profile as-is.
        return inferred
    # First pass: nested TCG-style structures (types, attacks, weaknesses, ...).
    _fill_tcg_like_template(output, inferred)
    # Second pass: flat fields matched through canonical key aliases.
    for key, current in output.items():
        canonical = _canonical_key(key)
        if canonical in inferred and _is_empty_value(current):
            output[key] = inferred[canonical]
    return output

View File

@@ -0,0 +1,35 @@
{
"category": "Pokemon",
"name": "",
"rarity": "",
"hp": "",
"types": [""],
"evolveFrom": "",
"description": "",
"stage": "",
"attacks": [
{
"cost": [""],
"name": "",
"effect": ""
},
{
"cost": [""],
"name": "",
"effect": "",
"damage": 0
}
],
"weaknesses": [
{
"type": "",
"value": ""
}
],
"retreat": 0,
"regulationMark": "",
"legal": {
"standard": true,
"expanded": true
}
}

View File

@@ -0,0 +1,248 @@
"""Rule-based keyword extraction and normalization for Pokemon card generation."""
from __future__ import annotations
import math
import re
from dataclasses import dataclass, field
from typing import Any, Dict, Iterable, List, Mapping, Optional, Sequence, Set, Tuple
# Canonical concept -> synonyms that should collapse onto it.
DEFAULT_NORMALIZATION_MAP: Dict[str, List[str]] = {
    "normal": ["basic", "common", "regular", "plain", "normaltype"],
    "fire": ["flame", "flames", "burn", "burning", "blaze", "fiery", "heat", "inferno"],
    "water": ["wave", "ocean", "sea", "river", "aqua", "splash", "tidal"],
    "grass": ["plant", "leaf", "forest", "nature", "vine", "seed", "flora"],
    "flying": ["air", "wind", "sky", "wing", "wings", "flight", "soar"],
    "fighting": ["punch", "kick", "strike", "martial", "combat", "brawl"],
    "poison": ["toxic", "venom", "acid", "poisonous", "toxin"],
    "electric": ["lightning", "thunder", "shock", "volt", "spark", "electricity"],
    "ground": ["earth", "soil", "sand", "mud", "quake", "dust"],
    "rock": ["stone", "boulder", "crystal", "rocky", "pebble"],
    # "psyonic" is a misspelling kept for backward compatibility;
    # "psionic" is the correct spelling and is matched as well.
    "psychic": ["mind", "mental", "telepathy", "psyonic", "psionic", "brain", "illusion"],
    "ice": ["freeze", "frozen", "snow", "frost", "blizzard", "icy"],
    "bug": ["insect", "ant", "beetle", "spider", "crawler"],
    # NOTE(review): "shadow" appears under both "ghost" and "dark"; after
    # inversion the later entry ("dark") silently wins for that synonym —
    # confirm this is intended.
    "ghost": ["spirit", "phantom", "haunt", "shadow", "specter"],
    "steel": ["metal", "iron", "armor", "blade", "alloy"],
    "dragon": ["drake", "wyrm", "serpent", "legendary"],
    "dark": ["shadow", "evil", "night", "doom", "darkness"],
    "fairy": ["magic", "magical", "sparkle", "light", "charm"],
    "explosion": ["explosive", "explode", "blast"],
}

# Part-of-speech tags retained during extraction.
DEFAULT_ALLOWED_POS: Tuple[str, ...] = ("NOUN", "ADJ", "VERB")

# Keywords dropped outright. NOTE(review): "pokmon" looks like the
# de-accented artifact of "pokémon" — confirm that spelling is intentional.
DEFAULT_IGNORED_KEYWORDS: Set[str] = {"preevolution", "pokmon"}

# Relative weight per POS tag used by the relevance scoring.
DEFAULT_POS_WEIGHTS: Dict[str, float] = {
    "NOUN": 3.0,
    "ADJ": 2.0,
    "VERB": 1.0,
}

# Conservative retention policy: keep 80% of scored keywords, bounded below
# and above so short and very long inputs both behave sensibly.
DEFAULT_KEEP_RATIO = 0.8
DEFAULT_MIN_KEYWORDS = 12
DEFAULT_MAX_KEYWORDS = 30
def _invert_normalization_map(normalization_map: Mapping[str, Iterable[str]]) -> Dict[str, str]:
"""Build synonym -> canonical mapping for O(1) normalization lookup."""
inverse: Dict[str, str] = {}
for canonical, synonyms in normalization_map.items():
canonical_normalized = canonical.strip().lower()
inverse[canonical_normalized] = canonical_normalized
for synonym in synonyms:
synonym_normalized = synonym.strip().lower()
if synonym_normalized:
inverse[synonym_normalized] = canonical_normalized
return inverse
def _tokenize_keyword_phrase(value: str) -> List[str]:
return re.findall(r"[a-z0-9]+", value.lower())
@dataclass
class KeywordExtractor:
    """Deterministic spaCy + YAKE + rule-based normalization pipeline.

    The pipeline runs in four stages: POS filtering, raw deduplication,
    synonym normalization (via ``normalization_map``), and an optional
    YAKE-based relevance ranking that conservatively prunes keywords
    while preserving the original order of appearance.
    """
    # A callable spaCy-like pipeline: nlp(text) yields tokens exposing
    # text, pos_, lemma_, is_stop and is_punct.
    nlp: Any
    # Canonical keyword -> synonyms; inverted once in __post_init__.
    normalization_map: Mapping[str, Iterable[str]] = field(default_factory=lambda: DEFAULT_NORMALIZATION_MAP)
    # POS tags that survive filtering.
    allowed_pos: Sequence[str] = field(default_factory=lambda: DEFAULT_ALLOWED_POS)
    # Lemmas dropped outright, compared case-insensitively.
    ignored_keywords: Set[str] = field(default_factory=lambda: set(DEFAULT_IGNORED_KEYWORDS))
    # Relative importance of each POS tag during ranking.
    pos_weights: Mapping[str, float] = field(default_factory=lambda: DEFAULT_POS_WEIGHTS)
    # Fraction of unique keywords kept when YAKE ranking is active.
    keep_ratio: float = DEFAULT_KEEP_RATIO
    # Lower/upper bounds on the number of keywords returned by ranking.
    min_keywords: int = DEFAULT_MIN_KEYWORDS
    max_keywords: int = DEFAULT_MAX_KEYWORDS
    # When False (or YAKE is unavailable), all normalized keywords are returned.
    use_yake: bool = True
    def __post_init__(self) -> None:
        """Precompute lookup structures so extract() does O(1) membership checks."""
        self._normalization_lookup = _invert_normalization_map(self.normalization_map)
        self._allowed_pos_set = set(self.allowed_pos)
        self._ignored_keywords = {keyword.lower().strip() for keyword in self.ignored_keywords}
        self._pos_weight_lookup = {k.upper(): float(v) for k, v in self.pos_weights.items()}
    @classmethod
    def from_default_model(
        cls,
        model_name: str = "en_core_web_sm",
        normalization_map: Optional[Mapping[str, Iterable[str]]] = None,
        allowed_pos: Sequence[str] = DEFAULT_ALLOWED_POS,
        ignored_keywords: Optional[Set[str]] = None,
        pos_weights: Mapping[str, float] = DEFAULT_POS_WEIGHTS,
        keep_ratio: float = DEFAULT_KEEP_RATIO,
        min_keywords: int = DEFAULT_MIN_KEYWORDS,
        max_keywords: int = DEFAULT_MAX_KEYWORDS,
        use_yake: bool = True,
    ) -> "KeywordExtractor":
        """Initialize extractor with a spaCy English pipeline.

        Raises:
            OSError: if the requested spaCy model is not installed.
            RuntimeError: if spaCy itself cannot be imported or loaded.
        """
        try:
            import spacy
            nlp = spacy.load(model_name)
        except OSError as exc:
            # spacy.load raises OSError specifically when the model package
            # is missing; re-raise with an actionable install hint.
            raise OSError(
                f"spaCy model '{model_name}' is not installed. "
                "Run: python -m spacy download en_core_web_sm"
            ) from exc
        except Exception as exc:
            raise RuntimeError(
                "spaCy could not be loaded in this Python environment. "
                "Try Python 3.13 or lower, then install spaCy and en_core_web_sm."
            ) from exc
        return cls(
            nlp=nlp,
            normalization_map=normalization_map or DEFAULT_NORMALIZATION_MAP,
            allowed_pos=allowed_pos,
            ignored_keywords=ignored_keywords or set(DEFAULT_IGNORED_KEYWORDS),
            pos_weights=pos_weights,
            keep_ratio=keep_ratio,
            min_keywords=min_keywords,
            max_keywords=max_keywords,
            use_yake=use_yake,
        )
    def extract(self, text: str) -> List[str]:
        """Extract, normalize and rank keywords from already-cleaned text.

        Keywords are returned in their original order of appearance; the
        YAKE ranking only decides *which* keywords survive, not the order.
        Returns an empty list for blank input.
        """
        if not text or not text.strip():
            return []
        doc = self.nlp(text)
        # Step 1: POS filtering + lowercase lemma/token extraction.
        raw_keywords: List[Tuple[str, str]] = []
        for token in doc:
            if token.is_stop or token.is_punct or token.pos_ not in self._allowed_pos_set:
                continue
            # "-PRON-" is the legacy spaCy v2 pronoun lemma placeholder;
            # fall back to the surface text in that case.
            base = token.lemma_.lower().strip() if token.lemma_ and token.lemma_ != "-PRON-" else token.text.lower().strip()
            if base and base not in self._ignored_keywords:
                raw_keywords.append((base, token.pos_))
        # Step 2: Deduplicate before domain normalization.
        deduplicated: List[Tuple[str, str]] = []
        seen_raw: Set[str] = set()
        for keyword, pos in raw_keywords:
            if keyword in seen_raw:
                continue
            seen_raw.add(keyword)
            deduplicated.append((keyword, pos))
        # Step 3: Normalize and deduplicate canonical forms.
        # Entries are (original, normalized, pos, first-seen index).
        unique_entries: List[Tuple[str, str, str, int]] = []
        seen_normalized: Set[str] = set()
        for index, (original_keyword, pos) in enumerate(deduplicated):
            normalized_keyword = self._normalize_keyword(original_keyword)
            if normalized_keyword in seen_normalized:
                continue
            seen_normalized.add(normalized_keyword)
            unique_entries.append((original_keyword, normalized_keyword, pos, index))
        if not unique_entries:
            return []
        if not self.use_yake:
            return [normalized_keyword for _, normalized_keyword, _, _ in unique_entries]
        # Step 4: YAKE scoring + conservative selection to preserve detail.
        yake_scores = self._extract_yake_scores(text)
        if not yake_scores:
            # YAKE unavailable or failed: keep everything rather than guess.
            return [normalized_keyword for _, normalized_keyword, _, _ in unique_entries]
        ranked: List[Tuple[float, int, str]] = []
        for original_keyword, normalized_keyword, pos, index in unique_entries:
            # A keyword may be scored under its original or canonical
            # spelling; take the best (lowest) available penalty.
            score_candidates: List[float] = []
            if original_keyword in yake_scores:
                score_candidates.append(yake_scores[original_keyword])
            if normalized_keyword in yake_scores:
                score_candidates.append(yake_scores[normalized_keyword])
            # Missing score is treated as moderately relevant to avoid over-pruning.
            yake_penalty = min(score_candidates) if score_candidates else 0.45
            pos_weight = self._pos_weight_lookup.get(pos.upper(), 1.0)
            # Lower penalty = more relevant, so invert before weighting by POS.
            combined_score = (1.0 - yake_penalty) * pos_weight
            ranked.append((combined_score, index, normalized_keyword))
        target_count = self._compute_target_count(len(ranked))
        # Rank by score (ties broken by first appearance), take the top
        # slice, then restore document order for the survivors.
        ranked.sort(key=lambda item: (-item[0], item[1]))
        selected = ranked[:target_count]
        selected.sort(key=lambda item: item[1])
        return [keyword for _, _, keyword in selected]
    def _compute_target_count(self, total_keywords: int) -> int:
        """Return how many keywords to keep: ceil(keep_ratio * total),
        clamped to [min_keywords, max_keywords] and the available count.
        A max_keywords of 0 or less disables the upper bound."""
        if total_keywords <= 0:
            return 0
        target = max(self.min_keywords, math.ceil(total_keywords * self.keep_ratio))
        if self.max_keywords > 0:
            target = min(target, self.max_keywords)
        return min(target, total_keywords)
    def _extract_yake_scores(self, text: str) -> Dict[str, float]:
        """Best-effort per-token YAKE penalties normalized to [0, 1].

        Returns an empty dict when YAKE is unavailable or errors, which
        the caller treats as "skip ranking entirely".
        """
        try:
            import yake
        except Exception:
            return {}
        # Scale the candidate pool with text length, clamped to [20, 80].
        text_token_count = len(text.split())
        top_n = max(20, min(80, text_token_count * 2))
        try:
            extractor = yake.KeywordExtractor(lan="en", n=2, dedupLim=0.9, top=top_n)
            phrase_scores = extractor.extract_keywords(text)
        except Exception:
            return {}
        # Spread phrase-level scores onto individual tokens, keeping the
        # best (lowest) score seen for each token.
        token_scores: Dict[str, float] = {}
        for phrase, score in phrase_scores:
            for token in _tokenize_keyword_phrase(phrase):
                existing = token_scores.get(token)
                if existing is None or score < existing:
                    token_scores[token] = score
        if not token_scores:
            return {}
        values = list(token_scores.values())
        min_score = min(values)
        max_score = max(values)
        if math.isclose(min_score, max_score):
            # All tokens equally relevant; use a neutral mid penalty.
            return {token: 0.5 for token in token_scores}
        # Normalize so 0.0=most important and 1.0=least important.
        return {
            token: (score - min_score) / (max_score - min_score)
            for token, score in token_scores.items()
        }
    def _normalize_keyword(self, keyword: str) -> str:
        """Map a keyword to its canonical domain form (identity fallback)."""
        keyword_lower = keyword.lower()
        return self._normalization_lookup.get(keyword_lower, keyword_lower)
def extract_keywords(
    text: str,
    extractor: Optional[KeywordExtractor] = None,
) -> List[str]:
    """Extract keywords from *text*, building a default extractor if needed.

    When *extractor* is omitted (or falsy), a KeywordExtractor with the
    default spaCy model and configuration is constructed on the fly.
    """
    return (extractor or KeywordExtractor.from_default_model()).extract(text)

View File

@@ -0,0 +1,2 @@
spacy>=3.7.0
yake>=0.4.2

View File

@@ -0,0 +1,143 @@
import unittest
from json_inference import fill_template_from_keywords, infer_profile_from_keywords
class JsonInferenceTests(unittest.TestCase):
    """End-to-end checks for keyword -> Pokemon JSON inference."""
    def test_profile_inference_basics(self) -> None:
        """A keyword list yields name, type, attacks, abilities and habitat."""
        keywords = [
            "zapthorn",
            "electric",
            "wolf",
            "thunder",
            "claw",
            "speed",
            "storm",
            "agile",
            "forest",
            "recover",
            "energy",
        ]
        profile = infer_profile_from_keywords(keywords)
        # The non-dictionary token becomes the (title-cased) name.
        self.assertEqual(profile["name"], "Zapthorn")
        self.assertEqual(profile["type"], "electric")
        self.assertIn("thunder", profile["attacks"])
        self.assertIn("claw", profile["attacks"])
        self.assertIn("recover", profile["abilities"])
        self.assertEqual(profile["habitat"], "forest")
        # "speed"/"agile" keywords should boost the speed stat.
        self.assertGreaterEqual(profile["speed"], 80)
    def test_fill_key_only_template(self) -> None:
        """A flat key-only template is fully populated from keywords."""
        template = {
            "name": "",
            "type": "",
            "secondary_type": None,
            "attacks": [],
            "abilities": [],
            "habitat": "",
            "personality": [],
            "hp": None,
            "attack": None,
            "defense": None,
            "speed": None,
            "description": "",
            "keywords": [],
        }
        keywords = [
            "furret",
            "normal",
            "tail",
            "smash",
            "tunnel",
            "agile",
            "cheerful",
            "explore",
            "endurance",
        ]
        result = fill_template_from_keywords(template, keywords)
        self.assertEqual(result["name"], "Furret")
        self.assertEqual(result["type"], "normal")
        self.assertIn("smash", result["attacks"])
        self.assertIn("explore", result["abilities"])
        self.assertEqual(result["habitat"], "tunnel")
        self.assertIn("cheerful", result["personality"])
        # A non-trivial free-text description must be synthesized.
        self.assertIsInstance(result["description"], str)
        self.assertGreater(len(result["description"]), 20)
    def test_fill_tcg_style_template(self) -> None:
        """Nested TCG-style fields (types, attacks, weaknesses) are inferred."""
        template = {
            "category": "Pokemon",
            "name": "",
            "hp": "",
            "types": [""],
            "description": "",
            "stage": "",
            "attacks": [
                {"cost": [""], "name": "", "effect": ""},
                {"cost": [""], "name": "", "effect": "", "damage": 0},
            ],
            "weaknesses": [{"type": "", "value": ""}],
            "retreat": 0,
        }
        keywords = [
            "zapthorn",
            "electric",
            "thunder",
            "claw",
            "speed",
            "storm",
            "energy",
        ]
        result = fill_template_from_keywords(template, keywords)
        self.assertEqual(result["name"], "Zapthorn")
        self.assertEqual(result["types"], ["electric"])
        self.assertEqual(result["stage"], "Basic")
        # TCG templates store hp as a numeric string.
        self.assertTrue(result["hp"].isdigit())
        # Electric types are weak to ground in the TCG type chart.
        self.assertEqual(result["weaknesses"][0]["type"], "ground")
        self.assertEqual(result["weaknesses"][0]["value"], "x2")
        self.assertEqual(result["attacks"][0]["name"], "Thunder")
        self.assertEqual(result["attacks"][1]["name"], "Claw")
        self.assertEqual(result["attacks"][0]["cost"], ["Electric"])
        self.assertGreaterEqual(result["retreat"], 0)
    def test_name_fallback_to_unknown_for_generic_tokens(self) -> None:
        """When every keyword is a common word, the name falls back to Unknown."""
        keywords = [
            "black",
            "fur",
            "giant",
            "electric",
            "claw",
            "speed",
            "storm",
        ]
        profile = infer_profile_from_keywords(keywords)
        self.assertEqual(profile["name"], "Unknown")
    def test_preserves_existing_values(self) -> None:
        """Non-empty template values are never overwritten by inference."""
        template = {
            "name": "CustomName",
            "type": "electric",
            "attacks": [],
            "description": "Already set",
        }
        keywords = ["furret", "normal", "attack"]
        result = fill_template_from_keywords(template, keywords)
        self.assertEqual(result["name"], "CustomName")
        self.assertEqual(result["type"], "electric")
        self.assertEqual(result["description"], "Already set")
        # Empty fields (attacks) are still filled in.
        self.assertIn("attack", result["attacks"])
# Allow running this test module directly.
if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,166 @@
import unittest
from keyword_extractor import KeywordExtractor
class FakeToken:
    """Duck-typed spaCy token exposing only the attributes the extractor reads."""

    def __init__(self, text: str, pos: str, lemma: str, is_stop: bool) -> None:
        self.text = text
        self.pos_ = pos
        self.lemma_ = lemma
        self.is_stop = is_stop
        # Treat the token as punctuation when no character is alphanumeric.
        self.is_punct = all(not ch.isalnum() for ch in text)
class FakeNLP:
    """Minimal stand-in for a spaCy pipeline driven by fixed tag/stopword tables."""

    def __init__(self, tag_map, stopwords) -> None:
        self.tag_map = tag_map
        self.stopwords = stopwords

    def __call__(self, text: str):
        # Whitespace tokenization is sufficient for the fixtures used here.
        return [self._build_token(raw.strip()) for raw in text.split()]

    def _build_token(self, token_text):
        # Unknown words default to NOUN, mirroring a permissive tagger.
        lowered = token_text.lower()
        return FakeToken(
            text=token_text,
            pos=self.tag_map.get(lowered, "NOUN"),
            lemma=lowered,
            is_stop=lowered in self.stopwords,
        )
class TestableKeywordExtractor(KeywordExtractor):
    """Extractor variant whose YAKE scoring returns injected fixture scores."""

    def __init__(self, *args, yake_scores=None, **kwargs):
        super().__init__(*args, **kwargs)
        self._canned_scores = yake_scores or {}

    def _extract_yake_scores(self, text: str):
        # Bypass the real YAKE dependency entirely.
        return self._canned_scores
class KeywordExtractorTests(unittest.TestCase):
    """Extraction-pipeline tests using a fake NLP so spaCy is not required."""
    @classmethod
    def setUpClass(cls) -> None:
        """Build a shared FakeNLP with fixed POS tags and stopwords."""
        # Word -> POS tag used by FakeNLP; unlisted words default to NOUN.
        tag_map = {
            "fiery": "ADJ",
            "dragon": "NOUN",
            "attack": "VERB",
            "explosive": "ADJ",
            "flames": "NOUN",
            "burning": "ADJ",
            "creature": "NOUN",
            "with": "ADP",
            "blaze": "NOUN",
            "and": "CCONJ",
            "dangerous": "ADJ",
            "electric": "ADJ",
            "mouse": "NOUN",
            "using": "VERB",
            "thunder": "NOUN",
            "shock": "NOUN",
            "strong": "ADJ",
            "furret": "NOUN",
            "long": "ADJ",
            "slender": "ADJ",
            "soft": "ADJ",
            "fur": "NOUN",
            "flexible": "ADJ",
            "body": "NOUN",
            "move": "VERB",
            "gracefully": "ADJ",
            "narrow": "ADJ",
            "tunnel": "NOUN",
            "tail": "NOUN",
            "smash": "VERB",
            "opponent": "NOUN",
            "battle": "NOUN",
            "cheerful": "ADJ",
            "endurance": "NOUN",
        }
        stopwords = {
            "a",
            "very",
            "and",
            "with",
            "the",
            "it",
            "to",
            "its",
            "that",
            "through",
            "in",
        }
        cls.nlp = FakeNLP(tag_map=tag_map, stopwords=stopwords)
        # Default extractor skips YAKE so results are fully deterministic.
        cls.extractor = KeywordExtractor(nlp=cls.nlp, use_yake=False)
    def test_readme_main_example(self) -> None:
        """The README example: synonyms collapse to canonical type words."""
        text = "fiery dragon attack explosive flames"
        result = self.extractor.extract(text)
        self.assertEqual(result, ["fire", "dragon", "attack", "explosion"])
    def test_synonym_normalization(self) -> None:
        """'burning' and 'blaze' both normalize to 'fire' and deduplicate."""
        text = "burning creature with blaze power"
        result = self.extractor.extract(text)
        self.assertEqual(result, ["fire", "creature", "power"])
    def test_mixed_types(self) -> None:
        """'thunder' and 'shock' collapse into the single 'electric' keyword."""
        text = "electric mouse using thunder shock"
        result = self.extractor.extract(text)
        self.assertEqual(result, ["electric", "mouse", "using"])
    def test_noise_input(self) -> None:
        """Stopwords and repeats are removed, content words survive."""
        text = "a very very strong and dangerous creature"
        result = self.extractor.extract(text)
        self.assertEqual(result, ["strong", "dangerous", "creature"])
    def test_yake_keeps_detailed_information(self) -> None:
        """With YAKE scores injected, high-relevance detail words survive pruning."""
        text = (
            "furret long slender creature soft fur flexible body move gracefully narrow tunnel "
            "tail smash opponent battle cheerful endurance"
        )
        # Lower score = more relevant (YAKE convention).
        yake_scores = {
            "furret": 0.00,
            "creature": 0.05,
            "tail": 0.08,
            "battle": 0.10,
            "smash": 0.12,
            "tunnel": 0.14,
            "endurance": 0.18,
            "body": 0.20,
            "cheerful": 0.22,
            "slender": 0.26,
            "flexible": 0.28,
            "gracefully": 0.34,
            "narrow": 0.40,
            "long": 0.42,
            "soft": 0.44,
            "fur": 0.45,
            "move": 0.48,
            "opponent": 0.52,
        }
        extractor = TestableKeywordExtractor(
            nlp=self.nlp,
            use_yake=True,
            keep_ratio=0.8,
            min_keywords=10,
            max_keywords=30,
            yake_scores=yake_scores,
        )
        result = extractor.extract(text)
        self.assertGreaterEqual(len(result), 10)
        self.assertIn("furret", result)
        self.assertIn("creature", result)
        self.assertIn("tail", result)
        self.assertIn("tunnel", result)
# Allow running this test module directly.
if __name__ == "__main__":
    unittest.main()