import { ProvenanceKit } from "@provenancekit/sdk";
import { withExtension } from "@provenancekit/extensions";
// SDK client used by every call below.
// NOTE(review): a live-looking API key ("pk_live_...") is hard-coded here —
// in real code load it from the environment/secret store rather than
// committing it to source control.
const pk = new ProvenanceKit({ apiKey: "pk_live_..." });
// 1. Register the entity that performed the training — the team or
// organization responsible for producing the model.
const trainer = await pk.entity({
  id: "org:acme-ai",
  name: "Acme AI Lab",
  role: "organization",
});
// 2. Register the training datasets as resources.
// Each dataset file is uploaded to IPFS and its resulting CID recorded.
const datasetA = await pk.file(datasetABuffer, {
  type: "dataset",
  name: "web-crawl-2024-q1.parquet",
});
const datasetB = await pk.file(datasetBBuffer, {
  type: "dataset",
  name: "books-cleaned-v3.parquet",
});
// Alternatively, reference a dataset by an already-published CID —
// content you point at but whose bytes you don't host yourself.
const openDataset = {
  name: "Common Crawl CC-MAIN-2024-10",
  type: "dataset" as const,
  // e.g. the well-known CID of a Common Crawl snapshot
  cid: "bafybeig...",
};
// 3. Record the training action itself.
const { action } = await pk.activity({
  entity: trainer, // the organization registered above performs this activity
  action: {
    type: "transform", // "transform" = produces a new artifact from inputs
    description: "Fine-tune LLaMA 3.1 on curated web + books corpus",
    // Provenance inputs: every dataset the training run consumed, by CID —
    // the two uploaded above plus the externally referenced one.
    inputs: [
      { cid: datasetA.cid, type: "dataset" },
      { cid: datasetB.cid, type: "dataset" },
      { cid: openDataset.cid, type: "dataset" },
    ],
    // AI extension payload: base model identity and training hyperparameters.
    // NOTE(review): the key pins schema version "ext:ai@1.0.0" — confirm it
    // matches the extension versions the installed SDK supports.
    extensions: {
      "ext:ai@1.0.0": {
        provider: "meta",
        model: "llama-3.1-8b",
        parameters: {
          epochs: 3,
          learningRate: 2e-5,
          batchSize: 32,
        },
      },
    },
  },
  // The produced artifact: the weights buffer is uploaded and becomes the
  // action's output resource.
  output: {
    file: modelWeightsBuffer, // Upload model weights to IPFS
    name: "acme-v1.0.safetensors",
    type: "model",
  },
});
// 4. The newly created model resource is the action's first output CID.
const [modelCid] = action.outputs;
console.log("Model CID:", modelCid);
// 5. Record license / AI-training opt-out status on each dataset
// (if datasets have known opt-out status).
// Attaches the license extension payload to a reference of datasetB; the
// result (`datasetMeta`) is the annotated resource descriptor.
// NOTE(review): only datasetB is annotated here — presumably the other
// datasets should get the same treatment once their status is known.
const datasetMeta = withExtension(
  { id: datasetB.cid, type: "dataset" as const },
  "ext:license@1.0.0",
  {
    spdxId: "CC-BY-4.0",
    aiTraining: "permitted", // explicitly permitted for training
    hasAITrainingReservation: false,
  }
);