import pandas as pd
import langwatch


def extract_tool_calls(response):
    """Extract the names of tool calls from the response output items."""
    tool_calls = []
    if hasattr(response, 'output') and response.output:
        for output_item in response.output:
            # Tool calls appear as output items with type "function_call"
            if output_item.type == 'function_call':
                tool_calls.append(output_item.name)
    return tool_calls
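
# The precision/recall helpers used in the loop below are assumed to be defined
# elsewhere in this guide; the following is a minimal sketch of one possible
# implementation, treating expected and actual tool names as sets (the exact
# behaviour may differ from the guide's own version):
def calculate_precision(actual_tools, expected_tools):
    """Fraction of the tools that were called which were actually expected."""
    if not actual_tools:
        # Convention: calling no tools is perfect precision only if none were expected
        return 1.0 if not expected_tools else 0.0
    return len(set(actual_tools) & set(expected_tools)) / len(set(actual_tools))


def calculate_recall(actual_tools, expected_tools):
    """Fraction of the expected tools that were actually called."""
    if not expected_tools:
        return 1.0
    return len(set(expected_tools) & set(actual_tools)) / len(set(expected_tools))
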
# Initialize a new experiment
evaluation = langwatch.evaluation.init("tool-calling-evaluation")
# Create a DataFrame from the test data for easier processing
test_df = pd.DataFrame([
    {
        "query": test_item[0],
        "expected": [tool.__name__ for tool in test_item[1]]
    }
    for test_item in tests
])
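# (Each entry in `tests` is assumed to pair a user query with the tool functions that
# should be called for it, e.g. ("What's the weather in Paris?", [get_weather]);
# the example query and tool name here are hypothetical.)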
# Wrap your loop with evaluation.loop(), and iterate as usual
results = []
for idx, row in evaluation.loop(test_df.iterrows()):
    # Run your model (await needs an async context, e.g. a notebook cell or an async function)
    result = await process_user_query(row["query"])
    # Extract tool calls
    actual_tools = extract_tool_calls(result["response"])
    # Calculate metrics
    precision = calculate_precision(actual_tools, row["expected"])
    recall = calculate_recall(actual_tools, row["expected"])
    # Log metrics for this sample
    evaluation.log("precision", index=idx, score=precision)
    evaluation.log("recall", index=idx, score=recall)
    # Include additional data for debugging
    evaluation.log("tool_selection",
                  index=idx,
                  score=recall,  # Using recall as the primary score
                  data={
                      "query": row["query"],
                      "expected_tools": row["expected"],
                      "actual_tools": actual_tools,
                      "response_time": round(result["time"], 2)
                  })
    # Store results for local analysis
    results.append({
        "query": row["query"],
        "expected": row["expected"],
        "actual": actual_tools,
        "time": round(result["time"], 2),
        "precision": precision,
        "recall": recall
    })
# Create DataFrame for local analysis
df = pd.DataFrame(results)
df
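
# Optional follow-up (e.g. in a separate cell): aggregate the per-sample metrics
# for a quick summary of the run.
df[["precision", "recall", "time"]].mean()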