Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(evals): add status enum for evaluation scores #2169

Merged
merged 11 commits into from
Mar 28, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions genkit-tools/common/src/eval/parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ export function enrichResultsWithScoring(
evaluator,
score: s.score,
scoreId: s.id,
status: s.status,
rationale: s.details?.reasoning,
error: s.error,
traceId: scoredSample.traceId,
Expand Down
3 changes: 3 additions & 0 deletions genkit-tools/common/src/types/eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -130,10 +130,13 @@ export type EvalInput = z.infer<typeof EvalInputSchema>;
export const EvalInputDatasetSchema = z.array(EvalInputSchema);
export type EvalInputDataset = z.infer<typeof EvalInputDatasetSchema>;

// Status of an evaluation score. Presumably mirrors the EvalStatusEnum
// declared in types/evaluator.ts — keep the two value lists in sync.
const EvalStatusEnumSchema = z.enum(['UNKNOWN', 'PASS', 'FAIL']);

export const EvalMetricSchema = z.object({
evaluator: z.string(),
scoreId: z.string().optional(),
score: z.union([z.number(), z.string(), z.boolean()]).optional(),
status: EvalStatusEnumSchema.optional(),
rationale: z.string().optional(),
error: z.string().optional(),
traceId: z.string().optional(),
Expand Down
5 changes: 3 additions & 2 deletions genkit-tools/common/src/types/evaluator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@ export const BaseEvalDataPointSchema = BaseDataPointSchema.extend({
testCaseId: z.string(),
});
export type BaseEvalDataPoint = z.infer<typeof BaseEvalDataPointSchema>;

// Status of an evaluation score: PASS/FAIL once judged, UNKNOWN when no
// judgement was (or could be) made.
export const EvalStatusEnumSchema = z.enum(['UNKNOWN', 'PASS', 'FAIL']);
/**
* Zod schema for evaluation score
*/
Expand All @@ -56,7 +57,7 @@ export const ScoreSchema = z.object({
.describe('Optional ID to differentiate different scores')
.optional(),
score: z.union([z.number(), z.string(), z.boolean()]).optional(),
// TODO: use StatusSchema
status: EvalStatusEnumSchema.optional(),
error: z.string().optional(),
details: z
.object({
Expand Down
11 changes: 11 additions & 0 deletions genkit-tools/genkit-schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -413,6 +413,14 @@
"$ref": "#/$defs/EvalFnResponse"
}
},
"EvalStatusEnum": {
"type": "string",
"enum": [
"UNKNOWN",
"PASS",
"FAIL"
]
},
"Score": {
"type": "object",
"properties": {
Expand All @@ -427,6 +435,9 @@
"boolean"
]
},
"status": {
"$ref": "#/$defs/EvalStatusEnum"
},
"error": {
"type": "string"
},
Expand Down
8 changes: 8 additions & 0 deletions go/ai/gen.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,14 @@ type EvalRequest struct {

type EvalResponse []any

// EvalStatusEnum indicates whether an evaluation score passed or failed.
type EvalStatusEnum string

// Allowed EvalStatusEnum values. These mirror the EvalStatusEnum values in
// the generated JSON schema — keep them in sync.
const (
	EvalStatusEnumUNKNOWN EvalStatusEnum = "UNKNOWN"
	EvalStatusEnumPASS EvalStatusEnum = "PASS"
	EvalStatusEnumFAIL EvalStatusEnum = "FAIL"
)

type FinishReason string

const (
Expand Down
13 changes: 12 additions & 1 deletion js/ai/src/evaluator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,15 @@ export const BaseEvalDataPointSchema = BaseDataPointSchema.extend({
});
export type BaseEvalDataPoint = z.infer<typeof BaseEvalDataPointSchema>;

/** Enum that indicates if an evaluation has passed or failed */
export enum EvalStatusEnum {
  UNKNOWN = 'UNKNOWN',
  PASS = 'PASS',
  FAIL = 'FAIL',
}

/** Zod schema accepting exactly the {@link EvalStatusEnum} string values. */
const EvalStatusEnumSchema = z.enum(['UNKNOWN', 'PASS', 'FAIL']);

export const ScoreSchema = z.object({
id: z
.string()
Expand All @@ -47,7 +56,7 @@ export const ScoreSchema = z.object({
)
.optional(),
score: z.union([z.number(), z.string(), z.boolean()]).optional(),
// TODO: use StatusSchema
status: EvalStatusEnumSchema.optional(),
error: z.string().optional(),
details: z
.object({
Expand Down Expand Up @@ -218,8 +227,10 @@ export function defineEvaluator<
testCaseId: datapoint.testCaseId,
evaluation: {
error: `Evaluation of test case ${datapoint.testCaseId} failed: \n${(e as Error).stack}`,
status: EvalStatusEnum.FAIL,
},
});
// Throw to mark the span as failed.
throw e;
}
}
Expand Down
1 change: 1 addition & 0 deletions js/ai/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ export {
} from './embedder.js';
export {
BaseDataPointSchema,
EvalStatusEnum,
evaluate,
evaluatorRef,
type EvalResponses,
Expand Down
1 change: 1 addition & 0 deletions js/genkit/src/evaluator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ export {
BaseEvalDataPointSchema,
EvalResponseSchema,
EvalResponsesSchema,
EvalStatusEnum,
EvaluatorInfoSchema,
ScoreSchema,
evaluatorRef,
Expand Down
123 changes: 82 additions & 41 deletions js/plugins/evaluators/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,11 @@
* limitations under the License.
*/

import { EmbedderReference, Genkit, ModelReference, z } from 'genkit';
import { Genkit, z } from 'genkit';
import {
BaseEvalDataPoint,
EvalResponse,
EvalStatusEnum,
Score,
evaluatorRef,
} from 'genkit/evaluator';
Expand All @@ -30,22 +31,18 @@ import {
maliciousnessScore,
regexp,
} from './metrics/index.js';
import { GenkitMetric } from './types.js';
export { GenkitMetric };
import {
AnswerRelevancyGenkitMetricConfig,
GenkitMetric,
ResolvedConfig,
isGenkitMetricConfig,
type GenkitMetricConfig,
type PluginOptions,
} from './types.js';
export { GenkitMetric, type GenkitMetricConfig, type PluginOptions };

const PLUGIN_NAME = 'genkitEval';

export interface PluginOptions<
ModelCustomOptions extends z.ZodTypeAny,
EmbedderCustomOptions extends z.ZodTypeAny,
> {
metrics?: Array<GenkitMetric>;
judge?: ModelReference<ModelCustomOptions>;
judgeConfig?: z.infer<ModelCustomOptions>;
embedder?: EmbedderReference<EmbedderCustomOptions>;
embedderOptions?: z.infer<EmbedderCustomOptions>;
}

/**
* Reference to the Genkit evaluator for a specified metric
*/
Expand Down Expand Up @@ -75,15 +72,16 @@ export function genkitEval<

export default genkitEval;

function hasMetric(arr: GenkitMetric[] | undefined, metric: GenkitMetric) {
return arr?.some((m) => m === metric);
}

function fillScores(dataPoint: BaseEvalDataPoint, score: Score): EvalResponse {
return {
testCaseId: dataPoint.testCaseId,
evaluation: score,
};
/**
 * Wraps a score into an EvalResponse for the given datapoint, optionally
 * letting `statusOverrideFn` recompute the PASS/FAIL status from the score.
 */
function fillScores(
  dataPoint: BaseEvalDataPoint,
  score: Score,
  statusOverrideFn?: (args: { score: Score }) => EvalStatusEnum
): EvalResponse {
  // When an override is provided it wins; otherwise keep the score's own status.
  const status = statusOverrideFn
    ? statusOverrideFn({ score })
    : score.status;
  return {
    testCaseId: dataPoint.testCaseId,
    evaluation: { ...score, status },
  };
}

/**
Expand All @@ -96,23 +94,35 @@ export function genkitEvaluators<
ai: Genkit,
params: PluginOptions<ModelCustomOptions, EmbedderCustomOptions>
) {
let { metrics, judge, judgeConfig, embedder, embedderOptions } = params;
if (!metrics) {
metrics = [GenkitMetric.MALICIOUSNESS, GenkitMetric.FAITHFULNESS];
} else if (!embedder && hasMetric(metrics, GenkitMetric.ANSWER_RELEVANCY)) {
throw new Error('Embedder must be specified if computing answer relvancy');
let { metrics } = params;
if (metrics.length === 0) {
throw new Error('No metrics configured in genkitEval plugin');
}
return metrics.map((metric) => {
switch (metric) {
const {
type,
judge,
judgeConfig,
embedder,
embedderOptions,
statusOverrideFn,
} = resolveConfig(metric, params);
const evaluator = `${PLUGIN_NAME}/${type.toLocaleLowerCase()}`;
switch (type) {
case GenkitMetric.ANSWER_RELEVANCY: {
if (!judge) {
throw new Error(
'Judge llms must be specified if computing answer relvancy'
);
}
if (!embedder) {
throw new Error(
'Embedder must be specified if computing answer relvancy'
);
}
return ai.defineEvaluator(
{
name: `${PLUGIN_NAME}/${metric.toLocaleLowerCase()}`,
name: evaluator,
displayName: 'Answer Relevancy',
definition:
'Assesses how pertinent the generated answer is to the given prompt',
Expand All @@ -126,7 +136,7 @@ export function genkitEvaluators<
judgeConfig,
embedderOptions
);
return fillScores(datapoint, answerRelevancy);
return fillScores(datapoint, answerRelevancy, statusOverrideFn);
}
);
}
Expand All @@ -138,7 +148,7 @@ export function genkitEvaluators<
}
return ai.defineEvaluator(
{
name: `${PLUGIN_NAME}/${metric.toLocaleLowerCase()}`,
name: evaluator,
displayName: 'Faithfulness',
definition:
'Measures the factual consistency of the generated answer against the given context',
Expand All @@ -150,7 +160,7 @@ export function genkitEvaluators<
datapoint,
judgeConfig
);
return fillScores(datapoint, faithfulness);
return fillScores(datapoint, faithfulness, statusOverrideFn);
}
);
}
Expand All @@ -162,7 +172,7 @@ export function genkitEvaluators<
}
return ai.defineEvaluator(
{
name: `${PLUGIN_NAME}/${metric.toLocaleLowerCase()}`,
name: evaluator,
displayName: 'Maliciousness',
definition:
'Measures whether the generated output intends to deceive, harm, or exploit',
Expand All @@ -174,14 +184,14 @@ export function genkitEvaluators<
datapoint,
judgeConfig
);
return fillScores(datapoint, maliciousness);
return fillScores(datapoint, maliciousness, statusOverrideFn);
}
);
}
case GenkitMetric.REGEX: {
return ai.defineEvaluator(
{
name: `${PLUGIN_NAME}/${metric.toLocaleLowerCase()}`,
name: evaluator,
displayName: 'RegExp',
definition: 'Tests output against the regexp provided as reference',
},
Expand All @@ -193,29 +203,60 @@ export function genkitEvaluators<
case GenkitMetric.DEEP_EQUAL: {
return ai.defineEvaluator(
{
name: `${PLUGIN_NAME}/${metric.toLocaleLowerCase()}`,
displayName: 'Deep Equal',
name: evaluator,
displayName: 'Deep Equals',
definition:
'Tests equality of output against the provided reference',
},
async (datapoint: BaseEvalDataPoint) => {
return fillScores(datapoint, await deepEqual(datapoint));
return fillScores(
datapoint,
await deepEqual(datapoint),
statusOverrideFn
);
}
);
}
case GenkitMetric.JSONATA: {
return ai.defineEvaluator(
{
name: `${PLUGIN_NAME}/${metric.toLocaleLowerCase()}`,
name: evaluator,
displayName: 'JSONata',
definition:
'Tests JSONata expression (provided in reference) against output',
},
async (datapoint: BaseEvalDataPoint) => {
return fillScores(datapoint, await jsonata(datapoint));
return fillScores(
datapoint,
await jsonata(datapoint),
statusOverrideFn
);
}
);
}
}
});
}

/**
 * Normalizes one metric entry — either a bare GenkitMetric value or a full
 * per-metric config object — into a ResolvedConfig, falling back to the
 * plugin-level options for any judge settings the entry omits.
 */
function resolveConfig<M extends z.ZodTypeAny, E extends z.ZodTypeAny>(
  metric: GenkitMetricConfig<M, E>,
  params: PluginOptions<M, E>
): ResolvedConfig<M, E> {
  if (!isGenkitMetricConfig(metric)) {
    // Bare enum value: everything else comes from the plugin options.
    return { type: metric, ...params };
  }
  // Embedder settings only apply to ANSWER_RELEVANCY; all other metric
  // configs have no embedder fields, so resolve them to undefined.
  const relevancyConfig =
    metric.type === GenkitMetric.ANSWER_RELEVANCY
      ? (metric as AnswerRelevancyGenkitMetricConfig<M, E>)
      : undefined;
  return {
    type: metric.type,
    statusOverrideFn: metric.statusOverrideFn,
    judge: metric.judge ?? params.judge,
    judgeConfig: metric.judgeConfig ?? params.judgeConfig,
    embedder: relevancyConfig?.embedder,
    embedderOptions: relevancyConfig?.embedderOptions,
  } as ResolvedConfig<M, E>;
}
6 changes: 4 additions & 2 deletions js/plugins/evaluators/src/metrics/answer_relevancy.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
import similarity from 'compute-cosine-similarity';
import { Genkit, ModelArgument, z } from 'genkit';
import { EmbedderArgument } from 'genkit/embedder';
import { BaseEvalDataPoint, Score } from 'genkit/evaluator';
import { BaseEvalDataPoint, EvalStatusEnum, Score } from 'genkit/evaluator';
import path from 'path';
import { getDirName, loadPromptFile, renderText } from './helper.js';

Expand Down Expand Up @@ -103,11 +103,13 @@ export async function answerRelevancyScore<
: answered
? 'Cosine similarity'
: 'Cosine similarity with penalty for insufficient answer';
const finalScore = adjustedScore * (isNonCommittal ? 0 : 1);
return {
score: adjustedScore * (isNonCommittal ? 0 : 1),
score: finalScore,
details: {
reasoning,
},
status: finalScore > 0.5 ? EvalStatusEnum.PASS : EvalStatusEnum.FAIL,
};
} catch (err) {
console.debug(
Expand Down
Loading
Loading