Model Graph-RAG Dataset Lineage

Connect sources, chunks, extracted entities, evidence spans, and pipeline provenance so retrieval artifacts remain auditable.

Core Terms

gmeow:Dataset, gmeow:Chunk, gmeow:ExtractedEntity, gmeow:EvidenceSpan

Read Next

Examples

Lillith Dataset

# SPDX-FileCopyrightText: 2026 Blackcat Informatics® Inc. <paudley@blackcatinformatics.ca>
# SPDX-License-Identifier: CC-BY-4.0
#
# The dataset descriptor for the Lillith worked example: the
# gmeow:Dataset node that the research-object exports (Croissant, RO-Crate,
# DCAT, DataCite, Frictionless) read their catalog metadata FROM — title,
# description, licence, attribution, publication date. Canonical instance
# data; every export is a generated lossy projection of it (P4/P5).
@prefix gmeow: <https://blackcatinformatics.ca/gmeow/> .
@prefix ex:    <https://blackcatinformatics.ca/gmeow/examples/graphrag/> .
@prefix rdfs:  <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xsd:   <http://www.w3.org/2001/XMLSchema#> .

# Catalog-level descriptor for the benchmark. Human-readable metadata comes
# first (label, title, description, publication date, location), followed by
# the links to the corpus it contains, its licence node, and the organization
# it is attributed to. ex:corpus-lillith is described in the pipeline example.
ex:lillith-benchmark
    a gmeow:Dataset ;
    gmeow:title "Lillith GraphRAG benchmark"@en ;
    rdfs:label "Lillith GraphRAG benchmark"@en ;
    gmeow:description "A worked GraphRAG benchmark dataset: a content-addressed corpus, its chunking, embeddings, vector index, retrieval events, and model-extracted entity/relationship descriptions — every artifact attributed and confidence-weighted, published as a research object."@en ;
    gmeow:datePublished "2026-06-12T00:00:00Z"^^xsd:dateTime ;
    gmeow:sourceLocation "https://blackcatinformatics.ca/gmeow/examples/graphrag/lillith-benchmark" ;
    gmeow:hasPart ex:corpus-lillith ;
    gmeow:hasLicense ex:lillith-license ;
    gmeow:wasAttributedTo ex:blackcat .

# Licence node for the benchmark: SPDX identification first, then the
# family classification and the links back to the licensed dataset and the
# licensing organization.
ex:lillith-license
    a gmeow:License ;
    rdfs:label "CC BY 4.0"@en ;
    gmeow:spdxLicenseId "CC-BY-4.0" ;
    gmeow:spdxLicenseName "Creative Commons Attribution 4.0 International" ;
    gmeow:licenseFamily gmeow:licenseFamilyCC ;
    gmeow:licensedWork ex:lillith-benchmark ;
    gmeow:licensor ex:blackcat .

# The organization the dataset is attributed to and that acts as licensor.
ex:blackcat a gmeow:Organization .
ex:blackcat rdfs:label "Blackcat Informatics® Inc."@en .

# Ingest provenance: the import activity that the catalog projections
# flatten to PROV.
# NOTE(review): no triple in this example links ex:lillith-ingest to the
# dataset or the corpus — confirm whether a connecting statement is intended.
ex:lillith-ingest
    a gmeow:ImportActivity ;
    gmeow:eventTemporalFrame gmeow:temporalFrameUTCGregorian ;
    gmeow:ingestedAt "2026-06-01T09:00:00Z"^^xsd:dateTime ;
    rdfs:label "lillith corpus ingest"@en .

# --- The pipeline run as a verifiable workflow (the Workflow Run Crate
# model): the extraction pipeline lives in a repository, its workflow
# definition is the buildConfigUri, the run is a BuildActivity
# performed by a Builder, and the published crate is the Distribution.
# Git repository that holds the extraction pipeline; both the browsable
# and the clone locations are recorded.
ex:pipeline-repo
    a gmeow:Repository ;
    gmeow:repositoryType gmeow:repoTypeGit ;
    gmeow:webUrl "https://example.org/lillith-pipeline" ;
    gmeow:cloneUrl "https://example.org/lillith-pipeline.git" ;
    rdfs:label "lillith-pipeline repository"@en .

# The builder named as participant of the pipeline run.
ex:pipeline-runner a gmeow:Builder .
ex:pipeline-runner rdfs:label "lillith pipeline runner"@en .

# One execution of the benchmark pipeline: when it happened, who took part,
# what it was built from (repository + workflow definition), and what it
# produced (the crate distribution).
ex:pipeline-run
    a gmeow:BuildActivity ;
    rdfs:label "lillith benchmark pipeline run 2026-06-02"@en ;
    gmeow:eventTime "2026-06-02T08:00:00Z"^^xsd:dateTime ;
    gmeow:eventTemporalFrame gmeow:temporalFrameUTCGregorian ;
    gmeow:hasParticipant ex:pipeline-runner ;
    gmeow:buildSource ex:pipeline-repo ;
    gmeow:buildConfigUri "https://example.org/lillith-pipeline/blob/main/ci/workflows/benchmark.yml" ;
    gmeow:buildOutput ex:lillith-crate .

# The published crate archive, identified by its content digest.
ex:lillith-crate a gmeow:Distribution .
ex:lillith-crate
    gmeow:contentDigest "blake3:8888999900001111222233334444555566667777aaaabbbbccccddddeeeeff66" ;
    rdfs:label "lillith.crate.zip"@en .

Lillith Pipeline

# SPDX-FileCopyrightText: 2026 Blackcat Informatics® Inc. <paudley@blackcatinformatics.ca>
# SPDX-License-Identifier: CC-BY-4.0
#
# Worked example: a Project-Lillith-shaped pipeline, end to end — every
# artifact content-addressed, every step attributed via the EXISTING
# provenance properties, the derived entity graph auditable and revisable.
@prefix gmeow: <https://blackcatinformatics.ca/gmeow/> .
@prefix ex:    <https://blackcatinformatics.ca/gmeow/examples/graphrag/> .
@prefix rdfs:  <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xsd:   <http://www.w3.org/2001/XMLSchema#> .

# --- Corpus and chunking (the core ai slice's Chunk).
# Source document: the 2025 list archive, pinned by its content digest.
ex:mail-archive
    a gmeow:Document ;
    gmeow:contentDigest "blake3:aa20bb31cc42dd53ee64ff750086119722a833b944c055d166e277f388a499b0" ;
    rdfs:label "list archive, 2025"@en .

# The working corpus: its own digest plus its single listed member, the
# mail archive.
ex:corpus-lillith
    a gmeow:Corpus ;
    gmeow:contentDigest "blake3:0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef" ;
    gmeow:corpusMember ex:mail-archive ;
    rdfs:label "Lillith working corpus"@en .

# A chunk cut from the mail archive, located by a start/end span and
# pinned by its own digest.
# NOTE(review): the unit of spanStart/spanEnd (characters vs. bytes) is not
# stated in this example — confirm against the ontology.
ex:chunk-7
    a gmeow:Chunk ;
    gmeow:contentDigest "blake3:fedcba9876543210fedcba9876543210fedcba9876543210fedcba9876543210" ;
    gmeow:chunkOf ex:mail-archive ;
    gmeow:spanEnd "6100"^^xsd:nonNegativeInteger ;
    gmeow:spanStart "5200"^^xsd:nonNegativeInteger .

# --- Embedding + index: attributed, metric-explicit, vector OUTSIDE (P12).
# The embedding model, recorded as a software agent.
ex:embedder
    a gmeow:SoftwareAgent ;
    rdfs:label "embedder-v3"@en .

# The activity that produced the embeddings.
ex:embed-run
    a gmeow:Activity ;
    gmeow:eventTemporalFrame gmeow:temporalFrameUTCGregorian ;
    gmeow:eventTime "2026-06-01T00:00:00Z"^^xsd:dateTime ;
    rdfs:label "embedding pass 2026-06-01"@en .

# Embedding of chunk-7. The graph carries only the metadata (model,
# dimensionality, metric, digest, provenance); the vector itself is
# referenced by URI rather than stored inline.
ex:embedding-7
    a gmeow:Embedding ;
    gmeow:contentDigest "blake3:1111222233334444555566667777888899990000aaaabbbbccccddddeeeeffff" ;
    gmeow:vectorRef "s3://lillith/vectors/chunk-7"^^xsd:anyURI ;
    gmeow:embeddingDimensions "1024"^^xsd:positiveInteger ;
    gmeow:distanceMetric gmeow:distanceMetricCosine ;
    gmeow:embeddingModel ex:embedder ;
    gmeow:embeddingOf ex:chunk-7 ;
    gmeow:wasDerivedFrom ex:chunk-7 ;
    gmeow:wasGeneratedBy ex:embed-run .

# The activity that built the vector index.
ex:index-build
    a gmeow:Activity ;
    gmeow:eventTemporalFrame gmeow:temporalFrameUTCGregorian ;
    gmeow:eventTime "2026-06-01T00:00:00Z"^^xsd:dateTime ;
    rdfs:label "index build 2026-06-01"@en .

# Vector index over the working corpus: algorithm, metric, and build
# parameters are stated explicitly, and the index is attributed to the
# build activity that generated it.
ex:index-lillith
    a gmeow:VectorIndex ;
    gmeow:indexesCorpus ex:corpus-lillith ;
    gmeow:wasGeneratedBy ex:index-build ;
    gmeow:indexAlgorithm gmeow:indexAlgorithmHnsw ;
    gmeow:indexParameters "{\"M\": 16, \"efConstruction\": 200}" ;
    gmeow:distanceMetric gmeow:distanceMetricCosine ;
    gmeow:contentDigest "blake3:2222333344445555666677778888999900001111aaaabbbbccccddddeeeeff00" .

# --- Retrieval: why did the model see this passage?
# One retrieval event: the query text, the index it ran against, the chunk
# it returned, and when it happened.
ex:retrieval-3
    a gmeow:RetrievalEvent ;
    gmeow:atTime "2026-06-02T10:00:00Z"^^xsd:dateTime ;
    gmeow:eventTemporalFrame gmeow:temporalFrameUTCGregorian ;
    gmeow:forQuery "who maintained the build system?" ;
    gmeow:againstIndex ex:index-lillith ;
    gmeow:retrievedChunk ex:chunk-7 .

# --- Extraction (core ModelInvocation) → derived DESCRIPTIONS.
# The extraction model, recorded as a software agent.
ex:extractor
    a gmeow:SoftwareAgent ;
    rdfs:label "extraction model"@en .

# The single model call the extracted descriptions are attributed to,
# with its sampling temperature and timestamp.
ex:invocation-44
    a gmeow:ModelInvocation ;
    gmeow:atTime "2026-06-01T12:00:00Z"^^xsd:dateTime ;
    gmeow:eventTemporalFrame gmeow:temporalFrameUTCGregorian ;
    gmeow:samplingTemperature 0.0 ;
    gmeow:usedModel ex:extractor .

# Two extracted entity descriptions. Each is derived from chunk-7,
# generated by invocation-44, and pinned by its own content digest.
ex:desc-mara
    a gmeow:ExtractedEntity ;
    gmeow:wasGeneratedBy ex:invocation-44 ;
    gmeow:wasDerivedFrom ex:chunk-7 ;
    gmeow:contentDigest "blake3:3333444455556666777788889999000011112222aaaabbbbccccddddeeeeff11" ;
    rdfs:label "extracted: 'Mara' (maintainer?)"@en .

ex:desc-buildsys
    a gmeow:ExtractedEntity ;
    gmeow:wasGeneratedBy ex:invocation-44 ;
    gmeow:wasDerivedFrom ex:chunk-7 ;
    gmeow:contentDigest "blake3:4444555566667777888899990000111122223333aaaabbbbccccddddeeeeff22" ;
    rdfs:label "extracted: 'the build system'"@en .

# Extracted relationship between the two entity descriptions (source:
# Mara, target: the build system), with the same chunk and invocation
# provenance as the entities themselves.
ex:rel-maintains
    a gmeow:ExtractedRelationship ;
    rdfs:label "extracted: Mara maintains the build system"@en ;
    gmeow:relationshipSource ex:desc-mara ;
    gmeow:relationshipTarget ex:desc-buildsys ;
    gmeow:wasGeneratedBy ex:invocation-44 ;
    gmeow:wasDerivedFrom ex:chunk-7 ;
    gmeow:contentDigest "blake3:5555666677778888999900001111222233334444aaaabbbbccccddddeeeeff33" .

# --- Community + summary: the global-question substrate, revisable.
# The clustering activity that communities are attributed to.
ex:cluster-run
    a gmeow:Activity ;
    gmeow:eventTemporalFrame gmeow:temporalFrameUTCGregorian ;
    gmeow:eventTime "2026-06-02T00:00:00Z"^^xsd:dateTime ;
    rdfs:label "leiden clustering 2026-06-02"@en .

# A level-0 community grouping the two extracted entity descriptions,
# generated by the clustering run.
ex:community-infra
    a gmeow:Community ;
    rdfs:label "infrastructure community"@en ;
    gmeow:communityLevel "0"^^xsd:nonNegativeInteger ;
    gmeow:communityMember ex:desc-mara ;
    gmeow:communityMember ex:desc-buildsys ;
    gmeow:wasGeneratedBy ex:cluster-run ;
    gmeow:contentDigest "blake3:6666777788889999000011112222333344445555aaaabbbbccccddddeeeeff44" .

# Summarization is recorded as its own model invocation, run after the
# clustering pass that produced the community (ex:cluster-run,
# 2026-06-02T00:00:00Z). The summary cannot be attributed to
# ex:invocation-44: that extraction call is timestamped
# 2026-06-01T12:00:00Z, before the community it would summarize existed.
# NOTE(review): the timestamp below is illustrative, and the model is kept
# as ex:extractor to match the original attribution — substitute the real
# summarization call's values if they differ.
ex:invocation-45 a gmeow:ModelInvocation ;
    gmeow:usedModel ex:extractor ;
    gmeow:samplingTemperature 0.0 ;
    gmeow:atTime "2026-06-02T01:00:00Z"^^xsd:dateTime ;
    gmeow:eventTemporalFrame gmeow:temporalFrameUTCGregorian .

# Summary of the infrastructure community: derived from the member entity
# descriptions and the underlying chunk, generated by the summarization
# invocation above.
ex:summary-infra a gmeow:CommunitySummary ;
    rdfs:label "summary: the infrastructure crew"@en ;
    gmeow:contentDigest "blake3:7777888899990000111122223333444455556666aaaabbbbccccddddeeeeff55" ;
    gmeow:summarizesCommunity ex:community-infra ;
    gmeow:wasDerivedFrom ex:desc-mara, ex:desc-buildsys, ex:chunk-7 ;
    gmeow:wasGeneratedBy ex:invocation-45 .