import { SolVec } from '@veclabs/solvec';
import * as fs from 'fs';
import * as path from 'path';
// ── Configuration ──────────────────────────────────────
// Tuning knobs for the RAG pipeline below.
const CHUNK_SIZE = 400; // words per chunk
const CHUNK_OVERLAP = 50; // words shared between consecutive chunks (must stay < CHUNK_SIZE)
const TOP_K = 5; // maximum number of chunks retrieved per query
const MIN_SCORE = 0.72; // minimum similarity score for a retrieved chunk to be used
const DIMENSIONS = 1536; // vector dimensionality — must match your embedding model's output
// ── Types ──────────────────────────────────────────────
/** A raw source document before chunking/indexing. */
interface Document {
// Unique document id; chunk ids are derived from it.
id: string;
// Full text content to be split into chunks.
content: string;
// Display name used in citations.
filename: string;
// Optional free-form string metadata.
metadata?: Record<string, string>;
}
/** One overlapping word-window slice of a Document. */
interface Chunk {
// `${docId}__chunk_${chunkIndex}` — see chunkDocument.
id: string;
// The chunk's text (words joined with single spaces).
text: string;
// Id of the parent Document.
docId: string;
// Filename of the parent Document (denormalized for citations).
filename: string;
// 0-based position of this chunk within the document.
chunkIndex: number;
// Total number of chunks produced for the document.
totalChunks: number;
}
/** Result returned by query(): generated answer plus source attributions. */
interface RAGResult {
answer: string;
sources: Array<{
filename: string;
chunkIndex: number;
// Similarity score rounded to 3 decimals.
score: number;
// Leading snippet of the chunk text.
excerpt: string;
}>;
// Optional on-chain verification info (not populated by query()).
proof?: {
verified: boolean;
explorerUrl: string;
};
}
// ── Chunking ───────────────────────────────────────────
/**
 * Split a document into overlapping word-based chunks.
 *
 * Each chunk holds up to CHUNK_SIZE words; consecutive chunks share
 * CHUNK_OVERLAP words so sentences straddling a boundary stay retrievable.
 *
 * Fixes vs. the previous version:
 * - totalChunks was a pre-loop estimate that could be wrong (even 0 for
 *   documents shorter than CHUNK_OVERLAP words); it is now set to the
 *   exact count after chunking.
 * - An empty/whitespace-only document previously produced one bogus
 *   empty chunk (''.split(/\s+/) yields ['']); it now produces none.
 *
 * @param doc - The document to split.
 * @returns The chunks in document order (empty array for empty content).
 */
function chunkDocument(doc: Document): Chunk[] {
  // Filter out empty tokens so blank content yields zero chunks.
  const words = doc.content.split(/\s+/).filter(w => w.length > 0);
  const step = CHUNK_SIZE - CHUNK_OVERLAP; // advance per chunk
  const chunks: Chunk[] = [];
  for (let i = 0, chunkIndex = 0; i < words.length; i += step, chunkIndex++) {
    chunks.push({
      id: `${doc.id}__chunk_${chunkIndex}`,
      text: words.slice(i, i + CHUNK_SIZE).join(' '),
      docId: doc.id,
      filename: doc.filename,
      chunkIndex,
      totalChunks: 0, // patched below once the real count is known
    });
  }
  // Set the exact chunk count now that all chunks exist.
  for (const c of chunks) c.totalChunks = chunks.length;
  return chunks;
}
// ── Indexing ───────────────────────────────────────────
/**
 * Chunk, embed, and upsert a set of documents into a VecLabs collection,
 * then verify the resulting index on-chain.
 *
 * @param documents - Documents to chunk and index.
 * @param collection - VecLabs collection handle (must expose `upsert` and `verify`).
 * @param embedFn - Batch embedder: texts in, one vector per text out.
 * @returns The number of chunks indexed plus the on-chain verification proof.
 */
async function indexDocuments(
  documents: Document[],
  collection: any,
  embedFn: (texts: string[]) => Promise<number[][]>
) {
  console.log(`Indexing ${documents.length} documents...`);
  // Flatten every document into its chunks.
  const chunks = documents.flatMap(doc => chunkDocument(doc));
  console.log(`Created ${chunks.length} chunks.`);
  // Embed and upsert in fixed-size batches to bound memory usage.
  const BATCH_SIZE = 100;
  let done = 0;
  for (let start = 0; start < chunks.length; start += BATCH_SIZE) {
    const slice = chunks.slice(start, start + BATCH_SIZE);
    const vectors = await embedFn(slice.map(c => c.text));
    // Pair each chunk with its embedding and push to the collection.
    const records = slice.map((c, idx) => ({
      id: c.id,
      values: vectors[idx],
      metadata: {
        text: c.text,
        docId: c.docId,
        filename: c.filename,
        chunkIndex: c.chunkIndex,
        totalChunks: c.totalChunks,
      }
    }));
    await collection.upsert(records);
    done += slice.length;
    console.log(`Indexed ${done} / ${chunks.length} chunks`);
  }
  // Verify integrity after indexing.
  const proof = await collection.verify();
  console.log(`\nIndex verified on-chain: ${proof.verified}`);
  console.log(`Explorer: ${proof.solanaExplorerUrl}`);
  return { chunksIndexed: chunks.length, proof };
}
// ── Retrieval + Generation ─────────────────────────────
async function query(
question: string,
collection: any,
embedFn: (text: string) => Promise<number[]>,
llmFn: (prompt: string) => Promise<string>
): Promise<RAGResult> {
// 1. Embed question
const queryEmbedding = await embedFn(question);
// 2. Retrieve relevant chunks
const results = await collection.query({
vector: queryEmbedding,
topK: TOP_K,
minScore: MIN_SCORE,
});
// 3. Handle no results
if (results.length === 0) {
return {
answer: "I couldn't find relevant information to answer that question in the provided documents.",
sources: [],
};
}
// 4. Build context
const contextBlocks = results.map((r: any, i: number) =>
`[${i + 1}] From "${r.metadata.filename}" (chunk ${r.metadata.chunkIndex + 1}/${r.metadata.totalChunks}):\n${r.metadata.text}`
);
const prompt = `You are a helpful assistant. Answer the question based only on the provided context.
If the context doesn't contain enough information, say so clearly.
Do not make up information not present in the context.
At the end, cite the source numbers you used.
Context:
${contextBlocks.join('\n\n')}
Question: ${question}
Answer:`;
// 5. Generate
const answer = await llmFn(prompt);
// 6. Build source list
const sources = results.map((r: any) => ({
filename: r.metadata.filename,
chunkIndex: r.metadata.chunkIndex,
score: Math.round(r.score \* 1000) / 1000,
excerpt: r.metadata.text.slice(0, 150) + '...',
}));
return { answer, sources };
}
// ── Main ───────────────────────────────────────────────
/**
 * Demo entry point: index one sample document into a devnet collection,
 * run a single query against it, and print the answer with its sources.
 */
async function main() {
  const sv = new SolVec({ network: 'devnet' });
  const collection = sv.collection('my-knowledge-base', { dimensions: DIMENSIONS });
  // Stand-in embedder — replace with your embedding provider (OpenAI, Cohere, etc.)
  const embedFn = async (texts: string | string[]): Promise<any> => {
    const batch = Array.isArray(texts) ? texts : [texts];
    return batch.map(() => Array(DIMENSIONS).fill(0).map(() => Math.random()));
  };
  // Stand-in LLM — replace with your LLM provider (OpenAI, Anthropic, Gemini, etc.)
  const llmFn = async (prompt: string): Promise<string> => {
    return "Placeholder answer from LLM";
  };
  // Build and index the demo corpus.
  const corpus: Document[] = [
    {
      id: 'doc_001',
      filename: 'veclabs-whitepaper.txt',
      content: fs.readFileSync('veclabs-whitepaper.txt', 'utf-8'),
    }
  ];
  await indexDocuments(corpus, collection, embedFn);
  // Ask one question and show the attributed answer.
  const result = await query(
    'How does VecLabs ensure data privacy?',
    collection,
    (text) => embedFn(text).then(r => r[0]),
    llmFn
  );
  console.log('\nAnswer:', result.answer);
  console.log('\nSources:');
  for (const s of result.sources) {
    console.log(` [${s.filename}] chunk ${s.chunkIndex} (score: ${s.score})`);
    console.log(` "${s.excerpt}"`);
  }
}
main().catch(console.error);