Files
Juicepyter/fetch_card.py
2026-03-19 18:16:20 +01:00

147 lines
4.9 KiB
Python

#!/usr/bin/env python3
"""
Download Pokémon TCG card images with embedded JSON metadata.
Uses the TCGdex SDK to:
1. List all sets (with configurable limit)
2. For each set, list all cards (with configurable limit)
3. Download each card image (PNG) and embed full card data as PNG metadata
"""
import io
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import asdict, fields, is_dataclass
from enum import Enum
from pathlib import Path

from PIL import Image, PngImagePlugin
from tcgdexsdk import TCGdex, Language
from tcgdexsdk.enums import Quality, Extension
# ── Configuration ──────────────────────────────────────────────
# Upper bound on how many sets main() walks through (None = all).
MAX_SETS = 10000
# Upper bound on how many cards are fetched from each set (None = all).
MAX_CARDS_PER_SET = 10000
# Images are written to a "cards" directory next to this script,
# one sub-directory per set id.
OUTPUT_DIR = Path(__file__).resolve().parent / "cards"
# Quality level requested from card.get_image().
IMAGE_QUALITY = Quality.HIGH
# Size of the thread pool used to download the cards of one set in parallel.
MAX_WORKERS = 8
# ───────────────────────────────────────────────────────────────
def card_to_dict(card) -> dict:
    """Build a JSON-serialisable dict from the public data attributes of *card*.

    Every name reported by ``dir(card)`` is considered, except names that
    start with an underscore and the SDK plumbing (``sdk``, ``get_image``,
    ``get_image_url``). Attributes whose value is callable are dropped; the
    remaining values are converted with ``_serialise``.
    """
    excluded = frozenset(("sdk", "get_image", "get_image_url"))
    public_names = (
        name
        for name in dir(card)
        if not name.startswith("_") and name not in excluded
    )

    result = {}
    for name in public_names:
        # Default of None mirrors a missing attribute as a null entry.
        value = getattr(card, name, None)
        if not callable(value):
            result[name] = _serialise(value)
    return result
def _serialise(obj):
    """Recursively convert *obj* into plain, JSON-serialisable Python data.

    Conversion rules, applied in order:

    * ``Enum`` members become their (serialised) ``value``.
    * ``None``, ``str``, ``int``, ``float`` and ``bool`` are returned as-is.
    * Dataclass instances become dicts of their fields, minus ``sdk``.
    * Lists and tuples become lists; dicts keep their keys.
    * Any other object with a ``__dict__`` becomes a dict of its instance
      attributes, minus ``sdk``.
    * Everything else is replaced by ``str(obj)``.
    """
    if isinstance(obj, Enum):
        # Enum members carry a back-reference to their class in __dict__;
        # walking it through the generic fallback below would never end.
        return _serialise(obj.value)
    if obj is None or isinstance(obj, (str, int, float, bool)):
        return obj
    if is_dataclass(obj) and not isinstance(obj, type):
        # Walk the fields by hand rather than via asdict(): asdict()
        # deep-copies every field (including the "sdk" back-reference)
        # before it could be filtered out, and it flattens nested
        # dataclasses so their own "sdk" fields would slip through.
        return {
            field.name: _serialise(getattr(obj, field.name))
            for field in fields(obj)
            if field.name != "sdk"
        }
    if isinstance(obj, (list, tuple)):
        return [_serialise(item) for item in obj]
    if isinstance(obj, dict):
        return {key: _serialise(value) for key, value in obj.items()}
    # Fallback: try dataclass-style attribute extraction
    if hasattr(obj, "__dict__"):
        return {
            key: _serialise(value)
            for key, value in obj.__dict__.items()
            if key != "sdk"
        }
    return str(obj)
def save_image_with_metadata(image_bytes: bytes, metadata: dict, path: Path):
    """Write *image_bytes* to *path* as a PNG with *metadata* embedded as JSON.

    The metadata is JSON-encoded (non-ASCII characters kept verbatim) and
    stored in a PNG text chunk under the keyword ``pokemon_metadata``.
    Pillow writes a tEXt chunk when the text is Latin-1 encodable and falls
    back to an iTXt chunk otherwise. Missing parent directories of *path*
    are created.
    """
    png_info = PngImagePlugin.PngInfo()
    png_info.add_text("pokemon_metadata", json.dumps(metadata, ensure_ascii=False))
    # The context manager closes the image once it has been written instead
    # of leaving that to garbage collection.
    with Image.open(io.BytesIO(image_bytes)) as img:
        path.parent.mkdir(parents=True, exist_ok=True)
        img.save(str(path), "PNG", pnginfo=png_info)
def process_card(card_id: str, set_dir: Path) -> str | None:
    """Download one card and store it as ``<localId>.png`` inside *set_dir*.

    Looks the card up by *card_id*, reads its image at ``IMAGE_QUALITY`` in
    PNG format and saves it together with the card's data as embedded
    metadata. Returns ``"<name> (<id>)"`` for a saved card, or ``None`` when
    the lookup yields nothing.
    """
    # A client is created per call; presumably so that worker threads do not
    # share one SDK instance -- confirm against the SDK's thread-safety notes.
    client = TCGdex(Language.EN)
    card = client.card.getSync(card_id)
    if card:
        raw_png = card.get_image(IMAGE_QUALITY, Extension.PNG).read()
        card_data = card_to_dict(card)
        target = set_dir / f"{card.localId}.png"
        save_image_with_metadata(raw_png, card_data, target)
        return f"{card.name} ({card.id})"
    return None
def _download_cards(cards, set_dir: Path) -> int:
    """Download *cards* into *set_dir* in parallel; return how many were saved.

    Each card is handled by ``process_card`` on a pool of ``MAX_WORKERS``
    threads. One progress line is printed per card as it completes.
    """
    saved = 0
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        futures = {
            pool.submit(process_card, cr.id, set_dir): cr.id
            for cr in cards
        }
        for future in as_completed(futures):
            card_id = futures[future]
            try:
                result = future.result()
            except Exception as e:
                # Top-level boundary for a single card: report the failure
                # and keep going so one bad card does not abort the set.
                print(f" {card_id}: failed ({e})")
                continue
            if result:
                saved += 1
                print(f" {result}")
            else:
                print(f" {card_id}: skipped")
    return saved


def main():
    """Download card images for every configured set and report the total.

    Lists all sets, keeps the first ``MAX_SETS`` of them, and for each set
    downloads up to ``MAX_CARDS_PER_SET`` cards into
    ``OUTPUT_DIR / <set id>``. Sets without cards are skipped.
    """
    sdk = TCGdex(Language.EN)

    # 1. Get sets
    all_sets = sdk.set.listSync()
    if not all_sets:
        print("No sets returned.")
        return

    # Slicing with None returns the whole list, so the limit is applied
    # unconditionally. A truthiness test would make a limit of 0 mean "all".
    sets_to_process = all_sets[:MAX_SETS]
    print(f"Processing {len(sets_to_process)} / {len(all_sets)} sets\n")

    total_downloaded = 0
    for si, set_resume in enumerate(sets_to_process, 1):
        full_set = sdk.set.getSync(set_resume.id)
        if not full_set or not full_set.cards:
            print(f"[{si}] {set_resume.name}: no cards, skipping")
            continue

        cards = full_set.cards[:MAX_CARDS_PER_SET]
        card_total = full_set.cardCount.total if full_set.cardCount else len(full_set.cards)
        # A separator keeps the set name and the card count from running
        # together in the progress line.
        print(f"[{si}/{len(sets_to_process)}] {set_resume.name} — {len(cards)}/{card_total} cards")

        total_downloaded += _download_cards(cards, OUTPUT_DIR / set_resume.id)
        print()

    print(f"Done — {total_downloaded} cards saved to {OUTPUT_DIR}")
# Run the downloader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()