launchdarkly · mattrmc1 · Jun 22, 2026 · Jun 23, 2026 · Jun 23, 2026 · Jun 23, 2026
@@ -30,8 +30,9 @@ public final class AIAgentConfig extends AIConfig {
       String instructions,
       JudgeConfiguration judgeConfiguration,
       Map<String, Tool> tools,
-      Supplier<LDAIConfigTracker> trackerFactory) {
-    super(key, enabled, Mode.AGENT, model, provider, trackerFactory);
+      Supplier<LDAIConfigTracker> trackerFactory,
+      Evaluator evaluator) {
+    super(key, enabled, Mode.AGENT, model, provider, trackerFactory, evaluator);
     this.instructions = instructions;
     this.judgeConfiguration = judgeConfiguration;
     this.tools = tools == null ? null : Collections.unmodifiableMap(tools);

@@ -32,8 +32,9 @@ public final class AICompletionConfig extends AIConfig {
       List<Message> messages,
       JudgeConfiguration judgeConfiguration,
       Map<String, Tool> tools,
-      Supplier<LDAIConfigTracker> trackerFactory) {
-    super(key, enabled, Mode.COMPLETION, model, provider, trackerFactory);
+      Supplier<LDAIConfigTracker> trackerFactory,
+      Evaluator evaluator) {
+    super(key, enabled, Mode.COMPLETION, model, provider, trackerFactory, evaluator);
     this.messages = messages == null ? null : Collections.unmodifiableList(messages);
     this.judgeConfiguration = judgeConfiguration;
     this.tools = tools == null ? null : Collections.unmodifiableMap(tools);

@@ -24,20 +24,23 @@ public abstract class AIConfig {
   private final Model model;
   private final Provider provider;
   private final Supplier<LDAIConfigTracker> trackerFactory;
+  private final Evaluator evaluator;
 
   AIConfig(
       String key,
       boolean enabled,
       Mode mode,
       Model model,
       Provider provider,
-      Supplier<LDAIConfigTracker> trackerFactory) {
+      Supplier<LDAIConfigTracker> trackerFactory,
+      Evaluator evaluator) {
     this.key = key;
     this.enabled = enabled;
     this.mode = mode;
     this.model = model;
     this.provider = provider;
     this.trackerFactory = Objects.requireNonNull(trackerFactory, "trackerFactory");
+    this.evaluator = Objects.requireNonNull(evaluator, "evaluator");
   }
 
   /**
@@ -102,4 +105,17 @@ public Provider getProvider() {
   public LDAIConfigTracker createTracker() {
     return trackerFactory.get();
   }
+
+  /**
+   * Returns the evaluator that coordinates judge execution for this configuration.
+   * <p>
+   * The value is the one handed to the constructor, which rejects {@code null}. For
+   * {@link AIJudgeConfig} this is always {@link Evaluator#noop()}. For
+   * {@link AICompletionConfig} and {@link AIAgentConfig} it is the evaluator supplied at
+   * construction time (also {@link Evaluator#noop()} unless a custom one is wired in).
+   *
+   * @return the evaluator, never {@code null}
+   */
+  public Evaluator getEvaluator() {
+    return evaluator;
+  }
 }
@@ -29,7 +29,7 @@ public final class AIJudgeConfig extends AIConfig {
       List<Message> messages,
       String evaluationMetricKey,
       Supplier<LDAIConfigTracker> trackerFactory) {
-    super(key, enabled, Mode.JUDGE, model, provider, trackerFactory);
+    super(key, enabled, Mode.JUDGE, model, provider, trackerFactory, Evaluator.noop());
     this.messages = messages == null ? null : Collections.unmodifiableList(messages);
     this.evaluationMetricKey = evaluationMetricKey;
   }

@@ -0,0 +1,94 @@
+package com.launchdarkly.sdk.server.ai;
+
+import com.launchdarkly.logging.LDLogger;
+import com.launchdarkly.sdk.server.ai.datamodel.LDAIConfigTypes.JudgeConfiguration;
+import com.launchdarkly.sdk.server.ai.datamodel.LDAITrackingTypes.JudgeResult;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.concurrent.CompletableFuture;
+
+/**
+ * Coordinates evaluation of an AI Config output by running a set of {@link Judge} instances.
+ * <p>
+ * An {@code Evaluator} is attached to an {@link AICompletionConfig} or {@link AIAgentConfig} and
+ * invoked by managed AI types (plan 4). In v1.0, the evaluator returned by the config retrieval
+ * methods is always a noop that returns an empty list immediately.
+ * <p>
+ * Instances are immutable and thread-safe.
+ */
+public final class Evaluator {
+  /** Shared instance returned by {@link #noop()}; it holds no judges and no logger. */
+  private static final Evaluator NOOP = new Evaluator();
+
+  /** Judges keyed by judge config key; an unmodifiable private copy. */
+  private final Map<String, Judge> judges;
+  /** Lists which judges to run and their sampling rates; {@code null} only for the noop. */
+  private final JudgeConfiguration judgeConfiguration;
+  /** Receives skipped-judge warnings; {@code null} only for the noop. */
+  private final LDLogger logger;
+  /** {@code true} for the noop singleton, which short-circuits {@link #evaluate}. */
+  private final boolean isNoop;
+
+  /**
+   * Builds the noop evaluator: no judges, no configuration and no logger. The {@code isNoop}
+   * flag keeps {@link #evaluate} from ever touching the {@code null} fields.
+   */
+  private Evaluator() {
+    this.judges = Collections.emptyMap();
+    this.judgeConfiguration = null;
+    this.logger = null;
+    this.isNoop = true;
+  }
+
+  /**
+   * Constructs an evaluator with the given judges and configuration.
+   * <p>
+   * The judges map is copied, so later changes to the caller's map are not observed.
+   *
+   * @param judges a map from judge config key to {@link Judge} instance; must not be {@code null}
+   * @param judgeConfiguration the judge configuration listing which judges to run and their sampling
+   *     rates; must not be {@code null}
+   * @param logger the logger; must not be {@code null}
+   * @throws NullPointerException if any argument is {@code null}
+   */
+  public Evaluator(Map<String, Judge> judges, JudgeConfiguration judgeConfiguration, LDLogger logger) {
+    this.judges = Collections.unmodifiableMap(new HashMap<>(Objects.requireNonNull(judges, "judges")));
+    this.judgeConfiguration = Objects.requireNonNull(judgeConfiguration, "judgeConfiguration");
+    this.logger = Objects.requireNonNull(logger, "logger");
+    this.isNoop = false;
+  }
+
+  /**
+   * Returns the shared noop evaluator, which immediately returns an empty result list without
+   * logging any warnings.
+   *
+   * @return the noop singleton, never {@code null}
+   */
+  public static Evaluator noop() {
+    return NOOP;
+  }
+
+  /**
+   * Runs all configured judges against the given input/output pair and returns their results.
+   * <p>
+   * When this is the noop evaluator, returns a completed future holding an empty list immediately.
+   * Otherwise, judges are run sequentially, on the calling thread, in the order specified by the
+   * {@link JudgeConfiguration}. Judges referenced in the configuration but absent from the judges
+   * map are skipped with a warning; this is not an error.
+   * <p>
+   * The returned list is unmodifiable on both paths, so callers see the same contract whether or
+   * not judges are configured.
+   * <p>
+   * This method does NOT call {@code trackJudgeResult} — that is the caller's responsibility.
+   *
+   * @param input the message history or prompt that was sent to the model
+   * @param output the model's response to evaluate
+   * @return a completed future holding the unmodifiable list of judge results; never {@code null}
+   */
+  public CompletableFuture<List<JudgeResult>> evaluate(String input, String output) {
+    if (isNoop) {
+      return CompletableFuture.completedFuture(Collections.emptyList());
+    }
+
+    List<JudgeResult> results = new ArrayList<>();
+    for (JudgeConfiguration.Judge entry : judgeConfiguration.getJudges()) {
+      String judgeKey = entry.getKey();
+      Judge judge = judges.get(judgeKey);
+      if (judge == null) {
+        logger.warn("Evaluator: no judge found for key '{}', skipping", judgeKey);
+        continue;
+      }
+      results.add(judge.evaluate(input, output, entry.getSamplingRate()));
+    }
+    // Wrap before publishing: the noop path already hands out an immutable list, so callers
+    // must not be able to rely on mutability here either.
+    return CompletableFuture.completedFuture(Collections.unmodifiableList(results));
+  }
+}
@@ -0,0 +1,210 @@
+package com.launchdarkly.sdk.server.ai;
+
+import com.launchdarkly.logging.LDLogger;
+import com.launchdarkly.sdk.server.ai.datamodel.LDAIConfigTypes.Message;
+import com.launchdarkly.sdk.server.ai.datamodel.LDAITrackingTypes.JudgeResult;
+
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.concurrent.ThreadLocalRandom;
+import java.util.stream.Collectors;
+
+/**
+ * Evaluates an AI model output against a judge prompt, returning a scored {@link JudgeResult}.
+ * <p>
+ * A {@code Judge} wraps an {@link AIJudgeConfig} and a {@link Runner}. Each call to
+ * {@link #evaluate} or {@link #evaluateMessages} invokes the runner with a formatted evaluation
+ * prompt and parses the structured {@code {score, reasoning}} response. Evaluation can be sampled
+ * to reduce cost: pass a {@code samplingRate} of {@code 0.0} to always skip, or {@code 1.0} to
+ * always run.
+ * <p>
+ * Instances are immutable and thread-safe.
+ */
+public final class Judge {
+  /**
+   * JSON-Schema fragment sent to the runner as the {@code outputType}, requesting structured
+   * {@code {score, reasoning}} output. Only {@code score} is required; every nested map is
+   * wrapped unmodifiable so the shared constant cannot be altered by a runner.
+   */
+  private static final Map<String, Object> EVALUATION_SCHEMA;
+  static {
+    Map<String, Object> scoreSchema = new HashMap<>();
+    scoreSchema.put("type", "number");
+
+    Map<String, Object> reasoningSchema = new HashMap<>();
+    reasoningSchema.put("type", "string");
+
+    Map<String, Object> properties = new HashMap<>();
+    properties.put("score", Collections.unmodifiableMap(scoreSchema));
+    properties.put("reasoning", Collections.unmodifiableMap(reasoningSchema));
+
+    Map<String, Object> schema = new HashMap<>();
+    schema.put("type", "object");
+    schema.put("properties", Collections.unmodifiableMap(properties));
+    schema.put("required", Collections.singletonList("score"));
+
+    EVALUATION_SCHEMA = Collections.unmodifiableMap(schema);
+  }
+
+  private final AIJudgeConfig config;
+  private final Runner runner;
+  private final LDLogger logger;
+
+  /**
+   * Constructs a judge.
+   *
+   * @param config the judge AI Config; must not be {@code null}
+   * @param runner the runner to invoke; must not be {@code null}
+   * @param logger the logger; must not be {@code null}
+   * @throws NullPointerException if any argument is {@code null}
+   */
+  public Judge(AIJudgeConfig config, Runner runner, LDLogger logger) {
+    this.config = Objects.requireNonNull(config, "config");
+    this.runner = Objects.requireNonNull(runner, "runner");
+    this.logger = Objects.requireNonNull(logger, "logger");
+  }
+
+  /**
+   * Evaluates the given input/output pair, always running (sampling rate {@code 1.0}).
+   *
+   * @param input the message history or prompt that was sent to the model
+   * @param output the model's response to evaluate
+   * @return the evaluation result; never {@code null}
+   */
+  public JudgeResult evaluate(String input, String output) {
+    return evaluate(input, output, 1.0);
+  }
+
+  /**
+   * Evaluates the given input/output pair, subject to the given sampling rate.
+   * <p>
+   * The result always carries this judge's config key and metric key. It is marked
+   * {@code sampled(false)} when the sampling draw skips the evaluation, and
+   * {@code success(false)} when the runner throws, returns no parsed output, or returns a score
+   * that is missing, non-numeric, non-finite, or outside {@code [0.0, 1.0]}. Every failure after
+   * sampling is logged at warn level.
+   *
+   * @param input the message history or prompt that was sent to the model
+   * @param output the model's response to evaluate
+   * @param samplingRate the fraction of evaluations to actually run; {@code 0.0} always skips,
+   *     {@code 1.0} always runs
+   * @return the evaluation result; never {@code null}
+   */
+  public JudgeResult evaluate(String input, String output, double samplingRate) {
+    // nextDouble() is in [0.0, 1.0), so a rate of 1.0 always runs and a rate of 0.0 never does.
+    if (ThreadLocalRandom.current().nextDouble() >= samplingRate) {
+      return failedResult(false).build();
+    }
+
+    String formatted = "MESSAGE HISTORY:\n" + input + "\n\nRESPONSE TO EVALUATE:\n" + output;
+    LDAIConfigTracker tracker = config.createTracker();
+
+    RunnerResult result;
+    try {
+      result = tracker.trackMetricsOf(
+          RunnerResult::getMetrics, () -> runner.run(formatted, EVALUATION_SCHEMA));
+    } catch (Exception ex) {
+      // Some exceptions carry no message; fall back to toString() so the failure stays diagnosable.
+      String message = ex.getMessage() != null ? ex.getMessage() : ex.toString();
+      logger.warn("Judge {}: runner invocation failed: {}", config.getKey(), message);
+      return failedResult(true).errorMessage(message).build();
+    }
+
+    // A null runner result is treated the same as a result with no parsed payload.
+    Map<String, Object> parsed = result == null ? null : result.getParsed();
+    if (parsed == null) {
+      logger.warn("Judge {}: runner returned null parsed output", config.getKey());
+      return failedResult(true).build();
+    }
+
+    Object scoreRaw = parsed.get("score");
+    if (!(scoreRaw instanceof Number)) {
+      logger.warn("Judge {}: parsed output missing numeric score", config.getKey());
+      return failedResult(true).build();
+    }
+    double score = ((Number) scoreRaw).doubleValue();
+    if (!Double.isFinite(score) || score < 0.0 || score > 1.0) {
+      logger.warn("Judge {}: score {} is outside [0.0, 1.0]", config.getKey(), score);
+      return failedResult(true).build();
+    }
+
+    JudgeResult.Builder resultBuilder = JudgeResult.builder()
+        .sampled(true)
+        .success(true)
+        .judgeConfigKey(config.getKey())
+        .metricKey(config.getEvaluationMetricKey())
+        .score(score);
+
+    // Reasoning is optional in the schema: a wrong type is ignored, not treated as a failure.
+    Object reasoningRaw = parsed.get("reasoning");
+    if (reasoningRaw instanceof String) {
+      resultBuilder.reasoning((String) reasoningRaw);
+    } else if (reasoningRaw != null) {
+      logger.warn("Judge {}: reasoning is not a string, ignoring", config.getKey());
+    }
+
+    return resultBuilder.build();
+  }
+
+  /**
+   * Starts a builder for an unsuccessful result tagged with this judge's config and metric keys.
+   *
+   * @param sampled whether the evaluation was actually attempted
+   * @return a builder with {@code success(false)} already set
+   */
+  private JudgeResult.Builder failedResult(boolean sampled) {
+    return JudgeResult.builder()
+        .sampled(sampled)
+        .success(false)
+        .judgeConfigKey(config.getKey())
+        .metricKey(config.getEvaluationMetricKey());
+  }
+
+  /**
+   * Evaluates a message list and runner response, always running (sampling rate {@code 1.0}).
+   * <p>
+   * Messages are formatted as {@code role: content} lines, joined by newlines.
+   *
+   * @param messages the messages that were sent to the model
+   * @param response the runner result whose {@link RunnerResult#getContent() content} is evaluated
+   * @return the evaluation result; never {@code null}
+   */
+  public JudgeResult evaluateMessages(List<Message> messages, RunnerResult response) {
+    return evaluateMessages(messages, response, 1.0);
+  }
+
+  /**
+   * Evaluates a message list and runner response, subject to the given sampling rate.
+   * <p>
+   * Messages are formatted as {@code role: content} lines, joined by newlines. A {@code null}
+   * message list, a {@code null} response, and a response with {@code null} content are each
+   * evaluated as an empty string.
+   *
+   * @param messages the messages that were sent to the model
+   * @param response the runner result whose {@link RunnerResult#getContent() content} is evaluated
+   * @param samplingRate the fraction of evaluations to actually run
+   * @return the evaluation result; never {@code null}
+   */
+  public JudgeResult evaluateMessages(List<Message> messages, RunnerResult response, double samplingRate) {
+    String formattedMessages = messages == null ? "" : messages.stream()
+        .map(m -> m.getRole().getWireValue() + ": " + m.getContent())
+        .collect(Collectors.joining("\n"));
+    // Without this guard, null content would be concatenated into the prompt as the text "null".
+    String content = response == null ? null : response.getContent();
+    return evaluate(formattedMessages, content == null ? "" : content, samplingRate);
+  }
+
+  /**
+   * Returns the judge AI Config this instance was constructed with.
+   *
+   * @return the judge config, never {@code null}
+   */
+  public AIJudgeConfig getConfig() {
+    return config;
+  }
+
+  /**
+   * Returns the runner this instance was constructed with.
+   *
+   * @return the runner, never {@code null}
+   */
+  public Runner getRunner() {
+    return runner;
+  }
+}