spaCy phrases/noun chunks
Sometimes you want to do something with all the noun chunks in a collection of texts. This script takes a `.jsonl` file as input and generates a collection of noun chunks for downstream analytical tasks.
# /// script
# dependencies = [
# "spacy", "radicli", "srsly"
# ]
# ///
import itertools as it
import json
from pathlib import Path
import spacy
import srsly
from radicli import Radicli, Arg
# CLI application object; commands are registered on it with `@cli.command`
# and dispatched by `cli.run()`.
cli = Radicli()
def _fetch_phrases(stream, nlp, keep_det=False):
for doc in nlp.pipe(stream):
for chunk in doc.noun_chunks:
if keep_det:
yield chunk.text
else:
yield " ".join([t.text for t in chunk if t.pos_ != "DET"])
@cli.command(
    # fmt: off
    "extract-phrases",
    file_in=Arg(help="A .jsonl file with texts"),
    output=Arg("--output", help="Output .jsonl with phrases. Will print if not provided"),
    model=Arg("--model", help="spaCy model to use"),
    n=Arg("--n", "-n", help="Only consider top `n` texts."),
    keep_det=Arg("--keep-det", help="Keep determiner in phrase."),
    # fmt: on
)
def extract_phrases(
    file_in: Path,
    model: str,
    n: int,
    keep_det: int,
    output: Path = None,
):
    """Turn a `.jsonl` file with texts into a `.jsonl` of extracted phrases.

    Each input record must have a ``"text"`` key. Noun chunks are extracted
    from every text, de-duplicated, and emitted as ``{"text": phrase}``
    records in the order in which each phrase was first seen.

    Args:
        file_in: Path to the input `.jsonl` file.
        model: Name or path of the spaCy pipeline to load.
        n: If truthy, only the first ``n`` texts are processed.
        keep_det: If truthy, determiners are kept in the phrases.
        output: Destination `.jsonl` path. When falsy, the records are
            printed to stdout as one JSON object per line.

    NOTE(review): ``keep_det`` is annotated as ``int`` although it is only
    used as a truthy switch; confirm how the CLI should expose it before
    changing the annotation.
    """
    texts = (ex["text"] for ex in srsly.read_jsonl(file_in))
    if n:
        texts = it.islice(texts, n)
    # spaCy registers the named entity recogniser as "ner"; noun chunks do
    # not depend on it, so it is disabled to avoid running it needlessly.
    nlp = spacy.load(model, disable=["ner"])
    # dict.fromkeys de-duplicates while preserving first-seen order, so the
    # output is reproducible between runs (set iteration order is not).
    phrases = dict.fromkeys(_fetch_phrases(texts, nlp, keep_det=keep_det))
    records = ({"text": phrase} for phrase in phrases)
    if output:
        srsly.write_jsonl(output, records)
    else:
        for record in records:
            print(json.dumps(record))
# Dispatch to the registered CLI commands only when this file is executed
# as a script, not when it is imported.
if __name__ == "__main__":
    cli.run()