first commit
This commit is contained in:
111
clean-text-to-keywords/infer_json_usage.py
Normal file
111
clean-text-to-keywords/infer_json_usage.py
Normal file
@@ -0,0 +1,111 @@
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from typing import Sequence
|
||||
|
||||
from keyword_extractor import KeywordExtractor
|
||||
from json_inference import fill_template_from_keywords
|
||||
|
||||
|
||||
def _build_parser() -> argparse.ArgumentParser:
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Extract keywords and infer values into a JSON template.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"text",
|
||||
nargs="*",
|
||||
help="Input description text.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--template",
|
||||
default="",
|
||||
help="Path to JSON template file with keys only. If omitted, full inferred JSON is returned.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model",
|
||||
default="en_core_web_sm",
|
||||
help="spaCy model name (default: en_core_web_sm).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--keywords",
|
||||
nargs="+",
|
||||
default=None,
|
||||
help="Provide keywords directly instead of raw text.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--json-only",
|
||||
action="store_true",
|
||||
help="Print only inferred JSON (skip keyword list output).",
|
||||
)
|
||||
return parser
|
||||
|
||||
|
||||
def _load_template(path: str):
|
||||
if not path:
|
||||
return {}
|
||||
|
||||
if not os.path.exists(path):
|
||||
raise FileNotFoundError(f"Template file not found: {path}")
|
||||
|
||||
with open(path, "r", encoding="utf-8") as file_handle:
|
||||
raw = file_handle.read().strip()
|
||||
if not raw:
|
||||
return {}
|
||||
return json.loads(raw)
|
||||
|
||||
|
||||
def _parse_keywords_fragment(raw: str):
|
||||
if not raw.strip():
|
||||
return []
|
||||
|
||||
try:
|
||||
parsed = json.loads(raw)
|
||||
if isinstance(parsed, list):
|
||||
return [str(item).strip().lower() for item in parsed if str(item).strip()]
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
tokens = re.findall(r"[a-zA-Z0-9_-]+", raw.lower())
|
||||
return [token for token in tokens if token]
|
||||
|
||||
|
||||
def _extract_keywords(args):
|
||||
if args.keywords:
|
||||
return [word.strip().lower() for word in args.keywords if word.strip()]
|
||||
|
||||
if args.template and not os.path.exists(args.template) and args.template.lstrip().startswith("["):
|
||||
raw = " ".join([args.template] + args.text)
|
||||
return _parse_keywords_fragment(raw)
|
||||
|
||||
if not args.text:
|
||||
raise ValueError("Provide input text or use --keywords.")
|
||||
|
||||
text = " ".join(args.text)
|
||||
extractor = KeywordExtractor.from_default_model(model_name=args.model)
|
||||
return extractor.extract(text)
|
||||
|
||||
|
||||
def main(argv: Sequence[str] | None = None) -> None:
    """CLI entry point: extract keywords and print the inferred JSON.

    Prints the keyword list (as a JSON array) followed by the inferred
    JSON document, unless ``--json-only`` suppresses the keyword line.
    """
    args = _build_parser().parse_args(argv)

    keywords = _extract_keywords(args)

    # A --template value that is an inline "[..." fragment (not a file on
    # disk) was already consumed as keywords above; don't load it as a file.
    template_path = args.template
    if args.template and not os.path.exists(args.template) and args.template.lstrip().startswith("["):
        template_path = ""

    inferred_json = fill_template_from_keywords(_load_template(template_path), keywords)

    if not args.json_only:
        print(json.dumps(keywords))
    print(json.dumps(inferred_json, indent=2))
|
||||
|
||||
|
||||
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user