AI
RAG Patterns
AI/ML Advanced v1.0.0
RAG Patterns
Overview
Retrieval-Augmented Generation (RAG) combines document retrieval with LLM generation for grounded, factual responses. This skill covers chunking strategies, retrieval optimization, and context assembly.
Key Concepts
RAG Pipeline Architecture
┌─────────────────────────────────────────────────────────────┐
│ RAG Pipeline Architecture │
├─────────────────────────────────────────────────────────────┤
│ │
│ Indexing Pipeline: │
│ ┌─────────────────────────────────────────────────────┐ │
│ │ │ │
│ │ Documents → Chunking → Embedding → Vector Store │ │
│ │ │ │ │ │ │ │
│ │ ▼ ▼ ▼ ▼ │ │
│ │ [PDF] [Chunks] [Vectors] [Index] │ │
│ │ [MD] [500 tok] [1536 dim] [HNSW] │ │
│ │ [HTML] │ │
│ │ │ │
│ └─────────────────────────────────────────────────────┘ │
│ │
│ Query Pipeline: │
│ ┌─────────────────────────────────────────────────────┐ │
│ │ │ │
│ │ Query → Embed → Search → Rerank → Context → LLM │ │
│ │ │ │ │ │ │ │ │ │
│ │ ▼ ▼ ▼ ▼ ▼ ▼ │ │
│ │ "How [Vec] [Top-K [Top-3 [Prompt [Ans] │ │
│ │ do" matches] relevant] + Docs] │ │
│ │ │ │
│ └─────────────────────────────────────────────────────┘ │
│ │
│ Advanced Techniques: │
│ ┌─────────────────────────────────────────────────────┐ │
│ │ • Hybrid search (vector + keyword) │ │
│ │ • Query expansion and rewriting │ │
│ │ • Hierarchical retrieval │ │
│ │ • Parent-child chunking │ │
│ │ • Cross-encoder reranking │ │
│ │ • Contextual compression │ │
│ └─────────────────────────────────────────────────────┘ │
│ │
└─────────────────────────────────────────────────────────────┘
Best Practices
1. Optimize Chunk Size
Balance context vs granularity (typically 256-512 tokens).
2. Include Chunk Overlap
20-50 token overlap prevents boundary issues.
3. Use Reranking
Cross-encoder reranking improves relevance.
4. Cite Sources
Include citations for verifiability.
5. Handle No-Results
Gracefully handle the case where no relevant documents are found.
Code Examples
Example 1: Document Chunking
@Service
public class ChunkingService {

    /** Target chunk size in tokens — balances context size vs retrieval granularity. */
    private static final int DEFAULT_CHUNK_SIZE = 500;

    /**
     * Token overlap between consecutive fixed-size chunks so content near a
     * boundary appears in both neighbors. Must stay smaller than
     * DEFAULT_CHUNK_SIZE or the loop in fixedSizeChunking would not advance.
     */
    private static final int DEFAULT_OVERLAP = 50;

    /** Used by semanticChunking; was referenced below but never declared. */
    private final EmbeddingService embeddingService;

    public ChunkingService(EmbeddingService embeddingService) {
        this.embeddingService = embeddingService;
    }

    /**
     * Splits a document into chunks using the requested strategy.
     *
     * @param document the source document to split
     * @param strategy which chunking algorithm to apply
     * @return ordered list of chunks covering the document content
     */
    public List<Chunk> chunkDocument(Document document, ChunkingStrategy strategy) {
        return switch (strategy) {
            case FIXED_SIZE -> fixedSizeChunking(document);
            case SEMANTIC -> semanticChunking(document);
            case RECURSIVE -> recursiveChunking(document);
            case SENTENCE -> sentenceChunking(document);
        };
    }

    /**
     * Fixed-size chunking with overlap: slides a DEFAULT_CHUNK_SIZE window over
     * the token stream, stepping by (size - overlap) tokens.
     */
    private List<Chunk> fixedSizeChunking(Document document) {
        List<Chunk> chunks = new ArrayList<>();
        String text = document.getContent();
        // NOTE(review): tokenize/countTokens are assumed tokenizer utilities
        // (token index -> character offset); confirm they exist in this module.
        List<Integer> tokenBoundaries = tokenize(text);
        int start = 0;
        int chunkIndex = 0;
        while (start < tokenBoundaries.size()) {
            int end = Math.min(start + DEFAULT_CHUNK_SIZE, tokenBoundaries.size());
            int startChar = tokenBoundaries.get(start);
            int endChar = end < tokenBoundaries.size()
                ? tokenBoundaries.get(end)
                : text.length();
            // Mutable map required: total_chunks is patched after the loop.
            // The previous Map.of(...) was immutable, so the forEach below
            // threw UnsupportedOperationException.
            Map<String, Object> metadata = new HashMap<>();
            metadata.put("chunk_index", chunkIndex);
            metadata.put("total_chunks", -1); // patched below once all chunks exist
            metadata.put("source", document.getSource());
            chunks.add(Chunk.builder()
                .id(document.getId() + "_" + chunkIndex)
                .documentId(document.getId())
                .content(text.substring(startChar, endChar))
                .startOffset(startChar)
                .endOffset(endChar)
                .tokenCount(end - start)
                .metadata(metadata)
                .build());
            start += DEFAULT_CHUNK_SIZE - DEFAULT_OVERLAP; // positive step: OVERLAP < CHUNK_SIZE
            chunkIndex++;
        }
        // Patch total chunk count now that it is known.
        int total = chunks.size();
        chunks.forEach(c -> c.getMetadata().put("total_chunks", total));
        return chunks;
    }

    /**
     * Semantic chunking: groups adjacent paragraphs, starting a new chunk when
     * the embedding similarity to the previous paragraph drops (topic change)
     * or when the token budget would be exceeded.
     */
    private List<Chunk> semanticChunking(Document document) {
        List<String> paragraphs = splitIntoParagraphs(document.getContent());
        List<float[]> embeddings = embeddingService.embedBatch(paragraphs);
        List<Chunk> chunks = new ArrayList<>();
        List<String> currentGroup = new ArrayList<>();
        int currentTokens = 0;
        for (int i = 0; i < paragraphs.size(); i++) {
            String para = paragraphs.get(i);
            int paraTokens = countTokens(para);
            // Cosine similarity below 0.7 to the previous paragraph is treated
            // as a topic boundary. NOTE(review): threshold is heuristic — tune
            // per embedding model.
            boolean isSemanticallyDifferent = i > 0
                && cosineSimilarity(embeddings.get(i), embeddings.get(i - 1)) < 0.7;
            if (isSemanticallyDifferent || currentTokens + paraTokens > DEFAULT_CHUNK_SIZE) {
                if (!currentGroup.isEmpty()) {
                    // NOTE(review): createChunk is assumed defined elsewhere in
                    // this module — confirm.
                    chunks.add(createChunk(document, String.join("\n\n", currentGroup)));
                    currentGroup.clear();
                    currentTokens = 0;
                }
            }
            currentGroup.add(para);
            currentTokens += paraTokens;
        }
        if (!currentGroup.isEmpty()) {
            chunks.add(createChunk(document, String.join("\n\n", currentGroup)));
        }
        return chunks;
    }

    /**
     * Recursive chunking: tries progressively finer separators (section break,
     * paragraph, line, sentence, word) until pieces fit the token budget.
     */
    private List<Chunk> recursiveChunking(Document document) {
        List<String> separators = List.of("\n\n\n", "\n\n", "\n", ". ", " ");
        return recursiveSplit(document.getContent(), separators, 0, document);
    }

    /**
     * Splits text on separators.get(level); oversized pieces recurse to the
     * next (finer) separator, and the final fallback is a hard fixed-size split.
     */
    private List<Chunk> recursiveSplit(String text, List<String> separators,
                                       int level, Document document) {
        if (countTokens(text) <= DEFAULT_CHUNK_SIZE) {
            return List.of(createChunk(document, text));
        }
        if (level >= separators.size()) {
            // No finer separator left: hard split at token boundaries.
            // NOTE(review): assumes Document has an (id, content, source)
            // constructor — confirm.
            return fixedSizeChunking(new Document(document.getId(), text, document.getSource()));
        }
        String separator = separators.get(level);
        String[] parts = text.split(Pattern.quote(separator));
        List<Chunk> result = new ArrayList<>();
        StringBuilder current = new StringBuilder();
        for (String part : parts) {
            // Only include the separator in the size estimate when it will
            // actually be appended (previously an empty buffer still counted it).
            String candidate = current.length() == 0 ? part : current + separator + part;
            if (countTokens(candidate) > DEFAULT_CHUNK_SIZE) {
                if (current.length() > 0) {
                    result.addAll(recursiveSplit(current.toString(), separators, level + 1, document));
                    current = new StringBuilder();
                }
            }
            if (current.length() > 0) current.append(separator);
            current.append(part);
        }
        if (current.length() > 0) {
            result.addAll(recursiveSplit(current.toString(), separators, level + 1, document));
        }
        return result;
    }

    /**
     * Sentence chunking: packs whole sentences into chunks up to the token
     * budget. Previously dispatched from chunkDocument but never implemented.
     */
    private List<Chunk> sentenceChunking(Document document) {
        // Naive sentence boundary: split after ., ! or ? followed by whitespace.
        String[] sentences = document.getContent().split("(?<=[.!?])\\s+");
        List<Chunk> chunks = new ArrayList<>();
        StringBuilder current = new StringBuilder();
        int currentTokens = 0;
        for (String sentence : sentences) {
            int sentenceTokens = countTokens(sentence);
            if (currentTokens + sentenceTokens > DEFAULT_CHUNK_SIZE && current.length() > 0) {
                chunks.add(createChunk(document, current.toString()));
                current.setLength(0);
                currentTokens = 0;
            }
            if (current.length() > 0) current.append(' ');
            current.append(sentence);
            currentTokens += sentenceTokens;
        }
        if (current.length() > 0) {
            chunks.add(createChunk(document, current.toString()));
        }
        return chunks;
    }

    /** Splits text into paragraphs on blank-line boundaries, dropping empties. */
    private List<String> splitIntoParagraphs(String text) {
        List<String> paragraphs = new ArrayList<>();
        for (String p : text.split("\\n{2,}")) {
            if (!p.isBlank()) {
                paragraphs.add(p.strip());
            }
        }
        return paragraphs;
    }

    /** Cosine similarity of two equal-length vectors; 0.0 for a zero vector. */
    private double cosineSimilarity(float[] a, float[] b) {
        double dot = 0.0, normA = 0.0, normB = 0.0;
        for (int i = 0; i < a.length; i++) {
            dot += a[i] * b[i];
            normA += a[i] * a[i];
            normB += b[i] * b[i];
        }
        if (normA == 0.0 || normB == 0.0) return 0.0;
        return dot / (Math.sqrt(normA) * Math.sqrt(normB));
    }

    /** Supported chunking algorithms. */
    enum ChunkingStrategy {
        FIXED_SIZE, SEMANTIC, RECURSIVE, SENTENCE
    }
}
Example 2: Retrieval Pipeline
@Service
public class RetrievalService {

    private final VectorStore vectorStore;
    private final EmbeddingService embeddingService;
    private final KeywordSearchService keywordSearch;
    private final CrossEncoderReranker reranker;
    /** Referenced below but previously undeclared. */
    private final ChunkRepository chunkRepository;
    /** Used for query expansion; previously undeclared. */
    private final LlmClient llmClient;

    /** Reciprocal Rank Fusion constant; 60 is the standard value from the RRF paper. */
    private static final int RRF_K = 60;

    public RetrievalService(VectorStore vectorStore,
                            EmbeddingService embeddingService,
                            KeywordSearchService keywordSearch,
                            CrossEncoderReranker reranker,
                            ChunkRepository chunkRepository,
                            LlmClient llmClient) {
        this.vectorStore = vectorStore;
        this.embeddingService = embeddingService;
        this.keywordSearch = keywordSearch;
        this.reranker = reranker;
        this.chunkRepository = chunkRepository;
        this.llmClient = llmClient;
    }

    /**
     * Hybrid search: runs vector and keyword search in parallel, fuses the two
     * rankings with Reciprocal Rank Fusion, then optionally reranks the top
     * candidates with a cross-encoder.
     *
     * @param query  the user query
     * @param config retrieval parameters (topK, rerank candidate count, flags)
     * @return top-K results ordered by relevance
     */
    public List<RetrievalResult> hybridSearch(String query, RetrievalConfig config) {
        // Run both searches concurrently, over-fetching 2x topK so fusion has
        // enough candidates. NOTE(review): supplyAsync uses the common
        // ForkJoinPool; pass a dedicated executor if these calls block on I/O.
        CompletableFuture<List<VectorMatch>> vectorFuture =
            CompletableFuture.supplyAsync(() -> vectorSearch(query, config.getTopK() * 2));
        // Was keywordSearch(query, ...), calling a nonexistent method; the
        // KeywordSearchService field is the intended target.
        // NOTE(review): assumes it exposes search(query, limit) — confirm API.
        CompletableFuture<List<KeywordMatch>> keywordFuture =
            CompletableFuture.supplyAsync(() -> keywordSearch.search(query, config.getTopK() * 2));
        List<VectorMatch> vectorResults = vectorFuture.join();
        List<KeywordMatch> keywordResults = keywordFuture.join();

        // Reciprocal Rank Fusion: score(id) = sum over rankings of 1 / (k + rank).
        Map<String, Double> fusedScores = new HashMap<>();
        for (int i = 0; i < vectorResults.size(); i++) {
            fusedScores.merge(vectorResults.get(i).getId(), 1.0 / (RRF_K + i + 1), Double::sum);
        }
        for (int i = 0; i < keywordResults.size(); i++) {
            fusedScores.merge(keywordResults.get(i).getId(), 1.0 / (RRF_K + i + 1), Double::sum);
        }

        // Top candidates by fused score, best first.
        List<String> candidateIds = fusedScores.entrySet().stream()
            .sorted(Map.Entry.<String, Double>comparingByValue().reversed())
            .limit(config.getRerankCandidates())
            .map(Map.Entry::getKey)
            .toList();

        // Hydrate chunks. Repository return order is unspecified, so restore
        // the fused-score order explicitly — previously the unsorted repository
        // order leaked into the non-reranked result path.
        Map<String, Integer> rankById = new HashMap<>();
        for (int i = 0; i < candidateIds.size(); i++) {
            rankById.put(candidateIds.get(i), i);
        }
        List<Chunk> candidates = new ArrayList<>(chunkRepository.findAllById(candidateIds));
        candidates.sort(Comparator.comparingInt(
            (Chunk c) -> rankById.getOrDefault(c.getId(), Integer.MAX_VALUE)));

        if (config.isUseReranking()) {
            // Cross-encoder reranking over the fused candidate pool.
            List<RerankedResult> reranked = reranker.rerank(query, candidates);
            return reranked.stream()
                .limit(config.getTopK())
                .map(this::toRetrievalResult)
                .toList();
        }
        return candidates.stream()
            .limit(config.getTopK())
            .map(this::toRetrievalResult)
            .toList();
    }

    /**
     * Query expansion for better recall: searches with LLM-generated query
     * variants, deduplicates matches, and reranks against the ORIGINAL query.
     */
    public List<RetrievalResult> searchWithQueryExpansion(String query, RetrievalConfig config) {
        List<String> expandedQueries = expandQuery(query);
        // Union of matches across all query variants, first-seen wins.
        Set<String> seenIds = new HashSet<>();
        List<VectorMatch> allMatches = new ArrayList<>();
        for (String q : expandedQueries) {
            List<VectorMatch> matches = vectorSearch(q, config.getTopK());
            for (VectorMatch match : matches) {
                if (seenIds.add(match.getId())) {
                    allMatches.add(match);
                }
            }
        }
        // Rerank combined results against the original query, not the variants.
        List<Chunk> chunks = chunkRepository.findAllById(
            allMatches.stream().map(VectorMatch::getId).toList()
        );
        List<RerankedResult> reranked = reranker.rerank(query, chunks);
        return reranked.stream()
            .limit(config.getTopK())
            .map(this::toRetrievalResult)
            .toList();
    }

    /**
     * Embeds the query and searches the vector store. Previously referenced but
     * never defined. NOTE(review): assumes VectorStore exposes
     * search(vector, topK) and EmbeddingService exposes embed(text) — confirm.
     */
    private List<VectorMatch> vectorSearch(String query, int topK) {
        float[] queryVector = embeddingService.embed(query);
        return vectorStore.search(queryVector, topK);
    }

    /**
     * Asks a cheap LLM for 3 paraphrases of the query and prepends the
     * original, so the caller always searches with the user's own wording too.
     * NOTE(review): parseJsonArray and toRetrievalResult are assumed helpers
     * defined elsewhere in this module — confirm.
     */
    private List<String> expandQuery(String query) {
        String prompt = """
            Generate 3 alternative search queries for: "%s"
            Return as JSON array of strings.
            Make them semantically similar but use different words.
            """.formatted(query);
        CompletionResponse response = llmClient.complete(
            CompletionRequest.builder()
                .model("gpt-3.5-turbo")
                .messages(List.of(Message.user(prompt)))
                .temperature(0.7)
                .build()
        );
        List<String> alternatives = parseJsonArray(response.getContent());
        alternatives.add(0, query); // Include original
        return alternatives;
    }
}
Example 3: Context Assembly
@Service
public class ContextAssemblyService {

    private final TokenCounter tokenCounter;
    /** Used by assembleHierarchicalContext; previously undeclared. */
    private final ChunkRepository chunkRepository;

    /** Hard token budget for the assembled context. */
    private static final int MAX_CONTEXT_TOKENS = 4000;

    /** Truncated fragments shorter than this (chars) are dropped as useless. */
    private static final int MIN_TRUNCATED_LENGTH = 100;

    public ContextAssemblyService(TokenCounter tokenCounter, ChunkRepository chunkRepository) {
        this.tokenCounter = tokenCounter;
        this.chunkRepository = chunkRepository;
    }

    /**
     * Assembles a citation-annotated context block from retrieval results,
     * greedily packing chunks (in retrieval order) until the token budget is
     * spent; the last chunk may be truncated to fit.
     *
     * @param query            the user query (currently unused here; kept for API stability)
     * @param retrievalResults ranked retrieval results, best first
     * @param config           assembly configuration
     * @return formatted context plus the chunks and citation map that back it
     */
    public AssembledContext assembleContext(
            String query,
            List<RetrievalResult> retrievalResults,
            ContextConfig config) {
        List<ContextChunk> contextChunks = new ArrayList<>();
        int usedTokens = 0;
        for (RetrievalResult result : retrievalResults) {
            int chunkTokens = tokenCounter.count(result.getContent());
            if (usedTokens + chunkTokens > MAX_CONTEXT_TOKENS) {
                // Budget exceeded: try to fit a truncated tail, then stop.
                // NOTE(review): truncateToTokenLimit and createContextChunk are
                // assumed helpers defined elsewhere in this module — confirm.
                String truncated = truncateToTokenLimit(
                    result.getContent(),
                    MAX_CONTEXT_TOKENS - usedTokens
                );
                if (truncated.length() > MIN_TRUNCATED_LENGTH) {
                    contextChunks.add(createContextChunk(result, truncated));
                    // Previously missing: the partial chunk's tokens were not
                    // counted, so totalTokens under-reported the real usage.
                    usedTokens += tokenCounter.count(truncated);
                }
                break;
            }
            contextChunks.add(createContextChunk(result, result.getContent()));
            usedTokens += chunkTokens;
        }
        return buildAssembledContext(contextChunks, usedTokens);
    }

    /**
     * Parent-child context: for each matched child chunk, also include its
     * parent chunk once (at slightly lower relevance) for broader context,
     * then sort everything by relevance.
     */
    public AssembledContext assembleHierarchicalContext(
            String query,
            List<RetrievalResult> childResults) {
        List<ContextChunk> contextChunks = new ArrayList<>();
        Set<String> includedParents = new HashSet<>();
        for (RetrievalResult child : childResults) {
            String parentId = child.getMetadata().get("parent_id");
            // Include each parent at most once for broader context.
            if (parentId != null && !includedParents.contains(parentId)) {
                Chunk parent = chunkRepository.findById(parentId).orElse(null);
                if (parent != null) {
                    contextChunks.add(ContextChunk.builder()
                        .content(parent.getContent())
                        .source(parent.getSource())
                        .relevance(child.getScore() * 0.8) // Slightly lower relevance
                        .isParent(true)
                        .build());
                    includedParents.add(parentId);
                }
            }
            // Include the matched child itself.
            contextChunks.add(ContextChunk.builder()
                .content(child.getContent())
                .source(child.getSource())
                .relevance(child.getScore())
                .highlightedMatch(true)
                .build());
        }
        // Highest relevance first.
        contextChunks.sort((a, b) -> Double.compare(b.getRelevance(), a.getRelevance()));
        return assembleFromChunks(contextChunks);
    }

    /**
     * Formats an already-selected chunk list into an AssembledContext.
     * Previously referenced but never defined.
     */
    private AssembledContext assembleFromChunks(List<ContextChunk> contextChunks) {
        int totalTokens = 0;
        for (ContextChunk chunk : contextChunks) {
            totalTokens += tokenCounter.count(chunk.getContent());
        }
        return buildAssembledContext(contextChunks, totalTokens);
    }

    /**
     * Shared formatting: numbers each chunk as "[Source N]", separates chunks
     * with "---", and records the citation number -> source mapping (the
     * citationMap field existed on AssembledContext but was never populated).
     */
    private AssembledContext buildAssembledContext(List<ContextChunk> contextChunks, int totalTokens) {
        StringBuilder formattedContext = new StringBuilder();
        formattedContext.append("Use the following context to answer the question.\n");
        formattedContext.append("Cite sources using [Source N] notation.\n\n");
        Map<Integer, String> citationMap = new HashMap<>();
        for (int i = 0; i < contextChunks.size(); i++) {
            ContextChunk chunk = contextChunks.get(i);
            citationMap.put(i + 1, chunk.getSource());
            formattedContext.append(String.format("[Source %d] %s\n",
                i + 1, chunk.getSource()));
            formattedContext.append(chunk.getContent());
            formattedContext.append("\n\n---\n\n");
        }
        return AssembledContext.builder()
            .formattedContext(formattedContext.toString())
            .chunks(contextChunks)
            .totalTokens(totalTokens)
            .citationMap(citationMap)
            .build();
    }
}
/**
 * Prompt-ready context produced by ContextAssemblyService, together with the
 * chunks it was built from.
 */
@Data
@Builder
class AssembledContext {
    // Fully formatted context text, including the citation-instruction header
    // and "[Source N]" labels per chunk.
    private String formattedContext;
    // The chunks included in the context, in the order they were formatted.
    private List<ContextChunk> chunks;
    // Token count of the included chunk contents.
    private int totalTokens;
    private Map<Integer, String> citationMap; // Citation number -> source
}
/**
 * A single piece of context included in an assembled prompt.
 */
@Data
@Builder
class ContextChunk {
    // Text placed into the prompt (may be truncated to fit the token budget).
    private String content;
    // Source identifier shown in the "[Source N]" citation line.
    private String source;
    // Retrieval/rerank score; parent chunks are assigned 0.8x of the matching
    // child's score by assembleHierarchicalContext.
    private double relevance;
    // True when this chunk is a parent included only for broader context.
    private boolean isParent;
    // True when this chunk is the directly matched (child) result.
    private boolean highlightedMatch;
}
Example 4: RAG Generation
@Service
public class RagGenerationService {

    private final RetrievalService retrievalService;
    private final ContextAssemblyService contextService;
    private final LlmClient llmClient;

    /** Compiled once — matches "[Source N]" citation markers in answers. */
    private static final Pattern CITATION_PATTERN = Pattern.compile("\\[Source (\\d+)\\]");

    /** Excerpt length (chars) stored on each citation. */
    private static final int EXCERPT_LENGTH = 200;

    public RagGenerationService(RetrievalService retrievalService,
                                ContextAssemblyService contextService,
                                LlmClient llmClient) {
        this.retrievalService = retrievalService;
        this.contextService = contextService;
        this.llmClient = llmClient;
    }

    /**
     * Full RAG flow: retrieve (hybrid + rerank), bail out when nothing
     * relevant is found, assemble cited context, and generate a grounded
     * answer at low temperature.
     *
     * @param query  the user question
     * @param config generation parameters (model, doc count, thresholds)
     * @return answer plus confidence, citations, and token usage
     */
    public RagResponse generateAnswer(String query, RagConfig config) {
        // Retrieve relevant documents with reranking enabled.
        List<RetrievalResult> retrievalResults = retrievalService.hybridSearch(
            query,
            RetrievalConfig.builder()
                .topK(config.getNumDocuments())
                .useReranking(true)
                .rerankCandidates(20)
                .build()
        );
        // No-result / low-relevance guard: refuse rather than hallucinate.
        if (retrievalResults.isEmpty() ||
                retrievalResults.get(0).getScore() < config.getMinRelevanceScore()) {
            return RagResponse.builder()
                .answer("I don't have enough relevant information to answer this question.")
                .confidence(0.0)
                .sources(List.of())
                .needsMoreContext(true)
                .build();
        }
        // Assemble citation-annotated context within the token budget.
        AssembledContext context = contextService.assembleContext(
            query, retrievalResults, ContextConfig.defaults());
        String systemPrompt = """
            You are a helpful assistant that answers questions based on the provided context.
            Guidelines:
            - Only use information from the provided context
            - If the context doesn't contain the answer, say so
            - Cite sources using [Source N] notation
            - Be concise but complete
            """;
        String userPrompt = """
            Context:
            %s
            Question: %s
            Answer the question based on the context above. Include citations.
            """.formatted(context.getFormattedContext(), query);
        // Low temperature: grounded QA should be near-deterministic.
        CompletionResponse response = llmClient.complete(
            CompletionRequest.builder()
                .model(config.getModel())
                .messages(List.of(
                    Message.system(systemPrompt),
                    Message.user(userPrompt)
                ))
                .temperature(0.1)
                .maxTokens(config.getMaxResponseTokens())
                .build()
        );
        List<Citation> citations = extractCitations(
            response.getContent(), context.getChunks());
        return RagResponse.builder()
            .answer(response.getContent())
            .confidence(calculateConfidence(retrievalResults))
            .sources(citations)
            .tokensUsed(response.getUsage().getTotalTokens())
            .build();
    }

    /**
     * Parses "[Source N]" markers out of the answer and maps them back to the
     * context chunks. Out-of-range source numbers are ignored.
     */
    private List<Citation> extractCitations(String answer, List<ContextChunk> chunks) {
        List<Citation> citations = new ArrayList<>();
        Matcher matcher = CITATION_PATTERN.matcher(answer);
        // TreeSet: deduplicates AND yields ascending source numbers. The
        // previous HashSet iteration made the citation order nondeterministic.
        Set<Integer> citedSources = new TreeSet<>();
        while (matcher.find()) {
            citedSources.add(Integer.parseInt(matcher.group(1)));
        }
        for (int sourceNum : citedSources) {
            if (sourceNum > 0 && sourceNum <= chunks.size()) {
                ContextChunk chunk = chunks.get(sourceNum - 1);
                citations.add(Citation.builder()
                    .sourceNumber(sourceNum)
                    .source(chunk.getSource())
                    .excerpt(truncate(chunk.getContent(), EXCERPT_LENGTH))
                    .build());
            }
        }
        return citations;
    }

    /**
     * Confidence heuristic: top retrieval score clamped to [0, 1].
     * Previously referenced but never defined.
     * NOTE(review): assumes retrieval scores are roughly normalized
     * similarities — confirm against the retriever's score semantics.
     */
    private double calculateConfidence(List<RetrievalResult> retrievalResults) {
        if (retrievalResults.isEmpty()) {
            return 0.0;
        }
        return Math.max(0.0, Math.min(1.0, retrievalResults.get(0).getScore()));
    }

    /** Truncates text to maxLength chars, appending an ellipsis when cut. */
    private String truncate(String text, int maxLength) {
        return text.length() <= maxLength ? text : text.substring(0, maxLength) + "...";
    }
}
/**
 * Result of a RAG answer-generation call.
 */
@Data
@Builder
class RagResponse {
    // Generated answer text; may contain "[Source N]" citation markers.
    private String answer;
    // Confidence score; set to 0.0 when no sufficiently relevant context was found.
    private double confidence;
    // Citations extracted from the answer, mapped back to context chunks.
    private List<Citation> sources;
    // Total tokens reported by the LLM for this completion.
    private int tokensUsed;
    // True when generation was refused because retrieval found nothing relevant.
    private boolean needsMoreContext;
}
/**
 * A single source citation extracted from a generated answer.
 */
@Data
@Builder
class Citation {
    // 1-based citation number as it appears in the answer ("[Source N]").
    private int sourceNumber;
    // Source identifier of the cited context chunk.
    private String source;
    // Short excerpt (first ~200 chars) of the cited chunk content.
    private String excerpt;
    // Link to the source document. NOTE(review): not populated by any code
    // visible in this file — confirm who sets it.
    private String url;
}
Example 5: RAG Evaluation
@Service
public class RagEvaluationService {

    private final RagGenerationService ragService;
    private final LlmClient evaluatorLlm;

    /** Matches the first decimal number in the evaluator model's reply. */
    private static final Pattern SCORE_PATTERN = Pattern.compile("\\d+(?:\\.\\d+)?");

    public RagEvaluationService(RagGenerationService ragService, LlmClient evaluatorLlm) {
        this.ragService = ragService;
        this.evaluatorLlm = evaluatorLlm;
    }

    /**
     * Runs the RAG pipeline on each test case and aggregates faithfulness,
     * relevance, and context-recall metrics.
     *
     * @param testCases queries with expected docs/answers to evaluate against
     * @return per-case results plus metric averages
     */
    public RagEvaluationReport evaluate(List<RagTestCase> testCases) {
        // NOTE(review): parallelStream runs blocking LLM calls on the common
        // ForkJoinPool; for large suites prefer a bounded executor.
        List<RagEvaluationResult> results = testCases.parallelStream()
            .map(this::evaluateTestCase)
            .toList();
        return RagEvaluationReport.builder()
            .results(results)
            .averageFaithfulness(average(results, RagEvaluationResult::getFaithfulness))
            .averageRelevance(average(results, RagEvaluationResult::getRelevance))
            .averageContextRecall(average(results, RagEvaluationResult::getContextRecall))
            .build();
    }

    /**
     * Scores one test case on faithfulness (grounding), answer relevance,
     * context recall, and — when an expected answer exists — correctness.
     * NOTE(review): evaluateRelevance, evaluateContextRecall, and
     * evaluateCorrectness are assumed LLM-judge helpers defined elsewhere in
     * this module — confirm.
     */
    private RagEvaluationResult evaluateTestCase(RagTestCase testCase) {
        RagResponse response = ragService.generateAnswer(
            testCase.getQuery(), RagConfig.defaults());
        // Faithfulness: are all answer claims supported by the cited excerpts?
        double faithfulness = evaluateFaithfulness(
            response.getAnswer(),
            response.getSources().stream().map(Citation::getExcerpt).toList()
        );
        // Relevance: does the answer actually address the query?
        double relevance = evaluateRelevance(
            testCase.getQuery(),
            response.getAnswer()
        );
        // Context recall: did retrieval surface the known-relevant documents?
        double contextRecall = evaluateContextRecall(
            testCase.getRelevantDocIds(),
            response.getSources().stream().map(Citation::getSource).toList()
        );
        // Correctness is optional: only when a gold answer is provided.
        Double correctness = null;
        if (testCase.getExpectedAnswer() != null) {
            correctness = evaluateCorrectness(
                testCase.getExpectedAnswer(),
                response.getAnswer()
            );
        }
        return RagEvaluationResult.builder()
            .query(testCase.getQuery())
            .answer(response.getAnswer())
            .faithfulness(faithfulness)
            .relevance(relevance)
            .contextRecall(contextRecall)
            .correctness(correctness)
            .build();
    }

    /**
     * LLM-as-judge faithfulness score in [0, 1]: asks GPT-4 whether every
     * claim in the answer is supported by the given contexts.
     */
    private double evaluateFaithfulness(String answer, List<String> contexts) {
        String prompt = """
            Given the following answer and source contexts, evaluate if the answer
            is faithful to the contexts (i.e., all claims are supported by the sources).
            Answer: %s
            Contexts:
            %s
            Score from 0.0 (completely unfaithful) to 1.0 (completely faithful).
            Return just the number.
            """.formatted(answer, String.join("\n---\n", contexts));
        CompletionResponse response = evaluatorLlm.complete(
            CompletionRequest.builder()
                .model("gpt-4")
                .messages(List.of(Message.user(prompt)))
                .temperature(0.0)
                .build()
        );
        return parseScore(response.getContent());
    }

    /**
     * Extracts the numeric score from an evaluator reply and clamps it to
     * [0, 1]. The previous Double.parseDouble(content.trim()) threw
     * NumberFormatException whenever the model wrapped the number in prose.
     *
     * @throws IllegalStateException when the reply contains no number at all
     */
    private double parseScore(String content) {
        Matcher matcher = SCORE_PATTERN.matcher(content);
        if (!matcher.find()) {
            throw new IllegalStateException("Evaluator returned no numeric score: " + content);
        }
        double score = Double.parseDouble(matcher.group());
        return Math.max(0.0, Math.min(1.0, score));
    }

    /**
     * Mean of a metric over all results; 0.0 for an empty result list.
     * Previously referenced but never defined.
     */
    private static double average(List<RagEvaluationResult> results,
                                  ToDoubleFunction<RagEvaluationResult> metric) {
        return results.stream().mapToDouble(metric).average().orElse(0.0);
    }
}
Anti-Patterns
❌ Fixed Chunk Sizes Without Context
Consider document structure when chunking.
❌ Retrieving Too Many or Too Few Documents
Balance between coverage and noise.