Metrics aggregate verification rewards across multiple trials to provide summary statistics and insights. They transform individual trial rewards into meaningful performance indicators. Harbor includes built-in metrics (mean, sum, min, max) and supports custom metric scripts for specialized evaluation.
#!/usr/bin/env python3
"""Custom metric: compute pass rate and average score."""

import argparse
import json


def main(argv=None):
    """Summarize trial rewards from a JSONL file into a JSON report.

    Every non-blank line of the input file is parsed as one JSON value.
    Falsy entries (``null``, ``{}``) still count towards ``total_trials`` but
    contribute no score. A record without a ``"reward"`` key is scored as 0
    for both the pass count and the average.

    Args:
        argv: Optional list of command-line arguments. When ``None``,
            argparse falls back to ``sys.argv[1:]``.

    Output keys: ``pass_rate``, ``average_score``, ``total_trials``,
    ``passed_trials``. A trial passes when its reward is greater than 0.5.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", required=True, help="Input JSONL file")
    parser.add_argument("-o", "--output", required=True, help="Output JSON file")
    args = parser.parse_args(argv)

    # Read rewards; blank lines (e.g. a trailing newline at EOF) are skipped
    # because json.loads("") would raise.
    with open(args.input, encoding="utf-8") as f:
        rewards = [json.loads(line) for line in f if line.strip()]

    # Compute metrics. One lookup rule (.get with default 0) feeds both the
    # pass count and the average so the two figures cannot disagree.
    total = len(rewards)
    scores = [r.get("reward", 0) for r in rewards if r]
    passed = sum(1 for score in scores if score > 0.5)

    result = {
        "pass_rate": passed / total if total > 0 else 0,
        "average_score": sum(scores) / len(scores) if scores else 0,
        "total_trials": total,
        "passed_trials": passed,
    }

    # Write output
    with open(args.output, "w", encoding="utf-8") as f:
        json.dump(result, f)


if __name__ == "__main__":
    main()
class MetricFactory:
    """Build metric instances from a ``MetricType`` via a lookup table."""

    # Maps each supported metric type to the class that implements it.
    _METRIC_MAP: dict[MetricType, type[BaseMetric]] = {
        MetricType.SUM: Sum,
        MetricType.MIN: Min,
        MetricType.MAX: Max,
        MetricType.MEAN: Mean,
        MetricType.UV_SCRIPT: UvScript,
    }

    @classmethod
    def create_metric(
        cls,
        metric_type: MetricType,
        **kwargs,
    ) -> BaseMetric:
        """Instantiate the metric class registered for ``metric_type``.

        Args:
            metric_type: Key looked up in ``_METRIC_MAP``.
            **kwargs: Forwarded unchanged to the metric class constructor.

        Returns:
            A new instance of the registered metric class.

        Raises:
            ValueError: If ``metric_type`` has no entry in ``_METRIC_MAP``.
        """
        metric_cls = cls._METRIC_MAP.get(metric_type)
        if metric_cls is None:
            message = (
                f"Unsupported metric type: {metric_type}. This could be because the "
                "metric is not registered in the MetricFactory or because the metric "
                "type is invalid."
            )
            raise ValueError(message)

        return metric_cls(**kwargs)
class Job:
    def _init_metrics(self):
        """Populate ``self._metrics`` with the metric instances for this job.

        Metrics configured on the job are stored under the ``"adhoc"`` key.
        For every registry-backed dataset, the metrics declared by its dataset
        spec are stored under the dataset's name, followed by the same
        job-level metric instances. Any key that ends up with no metrics gets
        a single ``Mean()``.
        """
        self._metrics: dict[str, list[BaseMetric]] = defaultdict(list)

        # Job-level metrics: built once and shared by every bucket below.
        shared_metrics = [
            MetricFactory.create_metric(spec.type, **spec.kwargs)
            for spec in self.config.metrics
        ]
        self._metrics["adhoc"].extend(shared_metrics)

        # Dataset-specific metrics: only registry datasets are handled here.
        for dataset_config in self.config.datasets:
            if not isinstance(dataset_config, RegistryDatasetConfig):
                continue

            registry_client = RegistryClientFactory.create(dataset_config.registry)
            dataset_spec = registry_client.get_dataset_spec(
                dataset_config.name, dataset_config.version
            )

            bucket = self._metrics[dataset_config.name]
            bucket.extend(
                [
                    MetricFactory.create_metric(spec.type, **spec.kwargs)
                    for spec in dataset_spec.metrics
                ]
            )
            bucket.extend(shared_metrics)

        # Default to Mean if no metrics specified
        for bucket in self._metrics.values():
            if not bucket:
                bucket.append(Mean())
#!/usr/bin/env python3
"""Custom metric: compute the pass rate over trial rewards."""

import argparse
import json


def main(argv=None):
    """Write the fraction of trials whose reward is at least 1.0.

    Every non-blank line of the input file is parsed as one JSON value.
    Falsy entries (``null``, ``{}``) count towards the total but never pass;
    a record without a ``"reward"`` key is treated as reward 0.

    Args:
        argv: Optional list of command-line arguments. When ``None``,
            argparse falls back to ``sys.argv[1:]``.

    The output file holds one JSON object with a single ``pass_rate`` key,
    which is 0 when the input contains no trials.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", required=True)
    parser.add_argument("-o", "--output", required=True)
    args = parser.parse_args(argv)

    # Skip blank lines (e.g. a trailing newline at EOF): json.loads("") raises.
    with open(args.input, encoding="utf-8") as f:
        rewards = [json.loads(line) for line in f if line.strip()]

    total = len(rewards)
    passed = sum(1 for r in rewards if r and r.get("reward", 0) >= 1.0)
    result = {"pass_rate": passed / total if total > 0 else 0}

    with open(args.output, "w", encoding="utf-8") as f:
        json.dump(result, f)


if __name__ == "__main__":
    main()