Anti-Hallucination & Content Safety Guardrails

The final quality layer validates every AI response before it reaches users, detecting and preventing hallucinations.

Purpose

Validation guardrails ensure:

No invented content (plots, recipes, characters)
No fake titles or authors
No category mismatches
Grounded in actual data
Estonian cultural accuracy

Validation Pipeline

Implementation

Location: app/api/chat/system-prompt.ts:398-567

Extract Valid Content

/**
 * Collects every lower-cased string an AI response may legitimately mention
 * for the given products: the product titles, each comma-separated author,
 * and capitalised words found in the descriptions.
 *
 * @param products Products whose title, authors and description are mined.
 * @returns A set of lower-cased strings; it never contains an empty or
 *          whitespace-only entry.
 */
function extractValidContent(products: ValidatedProduct[]): Set<string> {
  // A capitalised word: one uppercase letter followed by lowercase letters.
  // Unicode property escapes are used instead of `\b[A-ZÄÖÜÕ]` because `\b`
  // is ASCII-only in JavaScript, so words starting with Ä/Ö/Ü/Õ (and names
  // containing š/ž) were silently skipped.
  const capitalizedWord = /(?<![\p{L}\p{N}_])\p{Lu}\p{Ll}+/gu;
  const validContent = new Set<string>();

  for (const product of products) {
    // Product title (blank titles are skipped, see the note on authors below).
    if (product.title.trim().length > 0) {
      validContent.add(product.title.toLowerCase());
    }

    // Author names. A trailing or doubled comma yields an empty fragment;
    // adding "" to the set would make every substring comparison against it
    // succeed, so empty fragments are dropped.
    if (product.authors) {
      for (const fragment of product.authors.split(',')) {
        const author = fragment.trim().toLowerCase();
        if (author.length > 0) {
          validContent.add(author);
        }
      }
    }

    // Capitalised words from the description (names, places, ...).
    if (product.description) {
      const names = product.description.match(capitalizedWord) ?? [];
      for (const name of names) {
        // Two-letter matches are too short to be useful evidence.
        if (name.length > 2) {
          validContent.add(name.toLowerCase());
        }
      }
    }
  }

  return validContent;
}

Pattern Detection (Estonian-Specific)

/**
 * Regular expressions used to pull candidate mentions out of an AI response.
 *
 * Word edges are expressed with Unicode-aware lookarounds rather than `\b`:
 * in JavaScript `\b` only knows ASCII word characters, so a word that starts
 * or ends with Ä/Ö/Ü/Õ/Š/Ž never satisfied the boundary and was missed.
 * All patterns carry the `g` flag; use them with `match`/`matchAll`/`replace`
 * (a `g` regex is stateful when used with `test`/`exec`).
 */
const estonianPatterns = {
  // Proper nouns: one or more consecutive capitalised words
  properNouns: /(?<![\p{L}\p{N}_])\p{Lu}\p{Ll}+(?:\s+\p{Lu}\p{Ll}+)*(?![\p{L}\p{N}_])/gu,

  // Quoted text (potential titles)
  quotedText: /"([^"]+)"/g,

  // Author names: two or three consecutive capitalised words
  authorNames: /(?<![\p{L}\p{N}_])\p{Lu}\p{Ll}+(?:\s+\p{Lu}\p{Ll}+){1,2}(?![\p{L}\p{N}_])/gu,

  // Character titles: a role word followed by a capitalised name. The role
  // word may start with either case, but the `i` flag is deliberately not
  // used: it also relaxed the name part, so "professor on" was reported as
  // a character title.
  characterTitles: /(?<![\p{L}\p{N}_])(?:[Ii]nspektor|[Kk]omissar|[Uu]urija|[Dd]oktor|[Pp]rofessor)\s+\p{Lu}\p{Ll}+/gu,

  // Plot elements (should NOT appear), matched as whole words in any case
  plotElements: /(?<![\p{L}\p{N}_])(?:juhtum|mõrv|kuritegu|uurimine|kahtlane|ohver)(?![\p{L}\p{N}_])/giu
};

Confidence Scoring

/**
 * Scores how well a mention from the AI response is backed by known content.
 *
 * @param mention      Text fragment extracted from the response.
 * @param validContent Lower-cased strings that are known to be valid.
 * @param category     Kind of mention; only `'content'` is eligible for the
 *                     known-pattern score.
 * @returns 1.0 for an exact match, 0.7 for a substring match in either
 *          direction, 0.3 for a `'content'` mention that `isKnownPattern`
 *          accepts, otherwise 0.0 (treated as a hallucination).
 */
function calculateMentionConfidence(
  mention: string,
  validContent: Set<string>,
  category: 'title' | 'author' | 'content'
): number {
  const lower = mention.toLowerCase();

  // Exact match → 1.0
  if (validContent.has(lower)) return 1.0;

  // Partial match → 0.7
  for (const valid of validContent) {
    // Every string includes "", so a blank entry would turn every mention
    // into a partial match and hide real hallucinations. Skip such entries.
    if (valid.trim().length === 0) continue;

    if (valid.includes(lower) || lower.includes(valid)) {
      return 0.7;
    }
  }

  // Character/plot patterns → 0.3 (from descriptions)
  if (category === 'content' && isKnownPattern(mention)) {
    return 0.3;
  }

  // No match → 0.0 (hallucination!)
  return 0.0;
}

Context-Aware Filters

Category Mismatch Detection

// Does the user's request look like it is about cooking?
const isCookingQuery = /kok|itaalia|retsept|toit|cook|recipe/i.test(userContext);

if (isCookingQuery) {
  // Every product whose title and category carry no cooking keyword is
  // recorded as a mismatch against the cooking context.
  validatedProducts
    .filter(product => {
      const searchableText = `${product.title} ${product.category}`.toLowerCase();
      return !/kok|retsept|toit|cook|recipe|kitchen/i.test(searchableText);
    })
    .forEach(product => {
      invalidMentions.push({
        text: product.title,
        category: 'content_discussion',
        confidence: 0.8,
        context: 'Product does not match cooking context'
      });
    });
}

Example:

User: "kokaraamatud"
Product: "Helde Puu" (children's book)
→ MISMATCH: Not cooking-related
→ High confidence violation

Severity Levels

High Severity (Reject)

Criteria: At least one violation with confidence > 0.7

{
  isValid: false,
  severity: 'high',
  invalidMentions: [
    {
      text: '"Invented Book Title"',
      category: 'title_hallucination',
      confidence: 0.9
    }
  ],
  validatedResponse: FALLBACK_MESSAGE
}

Action: Replace with fallback

Medium Severity (Log & Allow)

Criteria: 0.4 ≤ Confidence ≤ 0.7

{
  isValid: true,
  severity: 'medium',
  invalidMentions: [
    {
      text: 'Ambiguous Name',
      category: 'author_hallucination',
      confidence: 0.5
    }
  ],
  validatedResponse: response  // Allow but log
}

Action: Log for review, monitor trend

Low Severity (Accept)

Criteria: No violations, or only violations with confidence < 0.4

{
  isValid: true,
  severity: 'low',
  invalidMentions: [],
  validatedResponse: response
}

Action: Accept as-is

Upstream Guardrails

Streaming Utils Strip Tool References

/**
 * Removes tool-call syntax from model text before it is shown to the user.
 *
 * Two shapes are stripped:
 *  - inline references of the form `{tool: ...}`, including ones that span
 *    several lines (up to the first closing brace);
 *  - fenced ```json blocks, with LF or CRLF line endings and with or without
 *    a body.
 *
 * @param text Raw text that may contain tool references.
 * @returns The text without those fragments, trimmed at both ends.
 */
function stripToolReferences(text: string): string {
  // `[^}]*` rather than `.*?`: `.` stops at a newline, so a tool reference
  // spread over several lines was left in the output.
  const inlineToolReference = /\{tool:[^}]*\}/g;

  // The opening fence may be followed by CRLF, and the body is optional, so
  // an empty block or Windows-style line endings are removed as well. The
  // closing fence must still start its own line.
  const jsonFence = /```json[ \t]*\r?\n(?:[\s\S]*?\r?\n)?[ \t]*```/g;

  return text
    .replace(inlineToolReference, '')
    .replace(jsonFence, '')
    .trim();
}

Hallucination Pattern Detection

/**
 * Estonian phrases that signal the model is narrating content (plot,
 * characters, recipe contents). Matching is case-insensitive.
 */
const HALLUCINATION_PATTERNS = [
  // "in the book happens"
  /raamatus juhtub/i,
  // "main character is"
  /peategelane on/i,
  // "story tells"
  /lugu räägib/i,
  // "author describes"
  /autor kirjeldab/i,
  // "recipe contains"
  /retsept sisaldab/i
];

/**
 * Reports whether the text contains any of the phrases listed in
 * HALLUCINATION_PATTERNS.
 *
 * @param text Text to inspect.
 * @returns `true` as soon as one pattern matches, otherwise `false`.
 */
function containsHallucinationPattern(text: string): boolean {
  for (const phrase of HALLUCINATION_PATTERNS) {
    if (phrase.test(text)) {
      return true;
    }
  }
  return false;
}

Skeleton/Text Ordering

// Switch the skeleton on once tools are expected and either enough text has
// streamed in (160 characters) or 800 ms have passed since the first event.
if (toolsExpected && !showSkeleton) {
  const enoughTextStreamed = accumulatedText.length >= 160;

  if (enoughTextStreamed || Date.now() - firstEventTime >= 800) {
    // Show placeholders
    setShowSkeleton(true);
  }
}

Benefit: User sees loading state, not "thinking" filler

Fallback Responses

When validation fails:

/**
 * Canned user-facing fallback messages, grouped first by language code
 * (`et`, `en`) and then by failure reason. Both languages expose the same
 * three keys: `noProducts`, `hallucination` and `categoryMismatch`.
 */
const FALLBACK_RESPONSES = {
  // Estonian messages
  et: {
    noProducts: 'Vabandust, hetkel ei ole sobivaid tooteid saadaval.',
    hallucination: 'Vabandust, ei saa anda soovitusi. Palun täpsusta päringut.',
    categoryMismatch: 'Vabandust, leidsime ainult mittesobivaid tooteid.'
  },
  
  // English messages
  en: {
    noProducts: 'Sorry, no suitable products available.',
    hallucination: 'Sorry, cannot provide recommendations. Please refine query.',
    categoryMismatch: 'Sorry, only found unsuitable products.'
  }
};

Testing

describe('Validation Guardrails', () => {
  it('detects title hallucination', () => {
    // The quoted title is absent from the product list.
    const verdict = validateAIResponse(
      'Soovitan raamatut "Invented Title"',
      [{ title: 'Real Book' }]
    );

    expect(verdict.isValid).toBe(false);
    expect(verdict.severity).toBe('high');
  });

  it('detects category mismatch', () => {
    // A children's book is offered in answer to a cookbook query.
    const catalogue = [{ title: 'Helde Puu', category: 'Lastekirjandus' }];
    const query = 'kokaraamatud';

    const verdict = validateAIResponse(
      'Helde Puu on suurepärane kokaraamat',
      catalogue,
      query
    );

    expect(verdict.invalidMentions.length).toBeGreaterThan(0);
  });

  it('accepts valid responses', () => {
    // The only title mentioned is one of the supplied products.
    const verdict = validateAIResponse(
      'Soovitan raamatut **Harry Potter**',
      [{ title: 'Harry Potter' }]
    );

    expect(verdict.isValid).toBe(true);
    expect(verdict.confidence).toBeGreaterThan(0.9);
  });
});

Known Gaps

Language Bias

Issue: Validation patterns are Estonian-heavy

Impact: English-only flows may be over-flagged

Mitigation:

/**
 * English counterparts of the mention-extraction patterns. Both carry the
 * `g` flag; use them with `match`/`matchAll`/`replace`.
 */
const englishPatterns = {
  // One or more consecutive capitalised words
  properNouns: /\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b/g,

  // A role word followed by a capitalised name. The role word may start with
  // either case, but the `i` flag is deliberately not used: it also relaxed
  // `[A-Z]`, so ordinary prose such as "doctor said" was reported as a
  // character title.
  characterTitles: /\b(?:[Dd]etective|[Ii]nspector|[Dd]octor|[Pp]rofessor)\s+[A-Z][a-z]+/g
};

Roadmap: Add comprehensive English pattern library

Monitoring

{
  // Validation metrics
  validationRate: 1.0,             // 100% of responses validated
  hallucinationRate: number,       // Should be <2%
  averageConfidence: number,       // Should be >0.8
  
  // Severity distribution
  highSeverity: number,            // Should be <1%
  mediumSeverity: number,          // Track trend
  lowSeverity: number,             // Majority
  
  // Category-specific
  titleHallucinations: number,
  authorHallucinations: number,
  contentDiscussions: number,
  categoryMismatches: number
}

Response Guardrails - Previous phase
Testing & Observability - Next phase
Response Validation - Detailed validation logic

Purpose​

Validation Pipeline​

Implementation​

Extract Valid Content​

Pattern Detection (Estonian-Specific)​

Confidence Scoring​

Context-Aware Filters​

Category Mismatch Detection​

Severity Levels​

High Severity (Reject)​

Medium Severity (Log & Allow)​

Low Severity (Accept)​

Upstream Guardrails​

Streaming Utils Strip Tool References​

Hallucination Pattern Detection​

Skeleton/Text Ordering​

Fallback Responses​

Testing​

Known Gaps​

Language Bias​

Monitoring​

Related Documentation​