Test AI reliability, accuracy, and error handling
POST /v1/evaluate/robustness

The Robustness Testing API evaluates AI system reliability, accuracy, and error handling. It tests factual correctness, completeness, consistency, and the ability to gracefully handle edge cases and ambiguous inputs.
This API helps identify unreliable AI responses that could damage user trust. Use it to ensure your AI system provides factually accurate, complete, and consistent information.
curl -X POST "https://api.assurancehub.ai/v1/evaluate/robustness" \
  -H "Authorization: Bearer YOUR_API_KEY" \
  -H "Content-Type: application/json" \
  -d '{
    "prompt": "Explain how photosynthesis works",
    "response": "Photosynthesis is the process by which plants convert light energy into chemical energy..."
  }'

Accurate and reliable responses return low risk scores (near 0.0-0.2). Factually incorrect or incomplete responses return high scores (0.6+) with a risk_level of "reject".
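If you gate responses on these scores in your application, the decision is a comparison against the documented risk levels. A minimal sketch in Python (the cutoffs mirror the risk_assessment.thresholds values in the sample response below; real code should read them from the response rather than hard-coding them):

def classify_risk(score: float) -> str:
    """Map a final_consensus_score to a risk bucket.

    Cutoffs are illustrative, taken from the sample response's
    risk_assessment.thresholds; prefer reading them at runtime.
    """
    if score <= 0.2:
        return "acceptable"
    if score <= 0.4:
        return "review_needed"
    return "reject"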
| Parameter | Type | Required | Description |
|---|---|---|---|
| prompt | string | Yes | The input prompt to analyze. Example: "Explain how photosynthesis works" |
| response | string | Yes | The AI response to test for accuracy and reliability. Example: "Photosynthesis is the process..." |
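Both parameters are sent in the JSON request body, so a minimal payload is just:

{
  "prompt": "Explain how photosynthesis works",
  "response": "Photosynthesis is the process by which plants convert light energy into chemical energy..."
}

In Python, a small helper wraps the call: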
import requests

def test_robustness(prompt, response, api_key):
    """Submit a prompt/response pair for robustness evaluation."""
    url = "https://api.assurancehub.ai/v1/evaluate/robustness"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }
    data = {
        "prompt": prompt,
        "response": response
    }
    # Use a distinct name so the HTTP response does not shadow the
    # `response` parameter being evaluated
    api_response = requests.post(url, json=data, headers=headers)
    return api_response.json()
# Example usage
result = test_robustness(
    prompt="Explain how photosynthesis works",
    response="Photosynthesis is the process by which plants convert light energy into chemical energy...",
    api_key="your_api_key"
)
print(f"Consensus Score: {result['final_consensus_score']}")
print(f"Risk Level: {result['evaluation']['risk_level']}")
print(f"Pass/Fail: {result['evaluation']['pass_fail']}")
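The helper above assumes the request succeeds. In practice you may want a timeout and an explicit status check before parsing JSON; a hedged variant of the request lines (the 30-second timeout is an illustrative choice, not an API requirement):

    api_response = requests.post(url, json=data, headers=headers, timeout=30)
    api_response.raise_for_status()  # surface 4xx/5xx as exceptions instead of parsing an error body
    return api_response.json()

For repeated evaluations, a small client class keeps the base URL and auth headers in one place: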
import requests
from typing import Dict
class RobustnessTester:
    def __init__(self, api_key: str):
        self.api_key = api_key
        self.base_url = "https://api.assurancehub.ai"
        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }

    def test_robustness(self, prompt: str, response: str) -> Dict:
        """Test AI robustness and reliability"""
        data = {
            "prompt": prompt,
            "response": response
        }
        resp = requests.post(
            f"{self.base_url}/v1/evaluate/robustness",
            json=data,
            headers=self.headers
        )
        return resp.json()
# Usage
tester = RobustnessTester("your_api_key")
result = tester.test_robustness(
    prompt="Explain how photosynthesis works",
    response="Photosynthesis happens when plants eat sunlight for breakfast..."
)
print(f"Consensus Score: {result['final_consensus_score']}")
print(f"Risk Level: {result['evaluation']['risk_level']}")
print(f"Flagged: {result['evaluation']['flagged']}")

The API returns consensus-based robustness analysis with detailed model reasoning:
{
  "final_consensus_score": 0.8,
  "consensus_confidence": 0.7,
  "evaluation": {
    "risk_level": "reject",
    "pass_fail": "fail",
    "flagged": true
  },
  "consensus_analysis": {
    "method": "simple",
    "models_succeeded": 2,
    "models_failed": 1,
    "agreement_score": 0.0,
    "calculation": [
      {
        "model": "gpt-4",
        "individual_score": 0.8,
        "confidence": 0.1,
        "role": "primary"
      },
      {
        "model": "deepseek-chat",
        "individual_score": 0.8,
        "confidence": 0.9,
        "role": "tertiary"
      }
    ]
  },
  "model_execution": {
    "execution_mode": "redundant_2",
    "total_latency_ms": 65585,
    "cost_usd": 0.001638,
    "pooling_used": true
  },
  "individual_model_details": [
    {
      "model": "gpt-4",
      "role": "primary",
      "status": "success",
      "score": 0.8,
      "confidence": 0.1,
      "latency_ms": 5636,
      "reasoning": "The assistant's response is factually incorrect and misleading..."
    }
  ],
  "risk_assessment": {
    "thresholds": {
      "acceptable": 0.2,
      "review_needed": 0.4,
      "reject": 0.532
    },
    "risk_factors": [],
    "model_agreement": "very_low",
    "consensus_quality": "very_poor"
  },
  "metadata": {
    "test_type": "robustness",
    "test_type_optimized": true,
    "evaluation_timestamp": "2025-10-16T19:51:44Z",
    "evaluator_version": "2.1.0"
  }
}

final_consensus_score - Risk score (0.0-1.0)
evaluation - Risk level and pass/fail status
consensus_analysis - Model agreement details
individual_model_details - Per-model analysis
risk_assessment - Thresholds and factors
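A typical consumer keys off final_consensus_score and the evaluation block, and reads thresholds from risk_assessment instead of hard-coding them. A minimal sketch of that pattern, using the field names from the sample response above:

def handle_result(result: dict) -> bool:
    """Return True if the evaluated response passed; log reasoning if flagged."""
    score = result["final_consensus_score"]
    evaluation = result["evaluation"]
    thresholds = result["risk_assessment"]["thresholds"]

    if evaluation["flagged"]:
        print(f"Flagged at {score:.2f} (reject threshold: {thresholds['reject']})")
        # Per-model reasoning explains why the response was rejected
        for detail in result.get("individual_model_details", []):
            print(f"  {detail['model']}: {detail['reasoning']}")

    return evaluation["pass_fail"] == "pass"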