import argparse
import copy
import json
import os
import re
from typing import Any, Dict

from tqdm import tqdm

from Atools import SearchAgent, check_factual


def normalize_url(raw_key: str) -> str:
    """
    The input may look like "https://a.com/x](https://a.com/x)" or otherwise contain
    markdown fragments; normalize it to a usable http(s) URL.
    Strategy: prefer the last fragment that starts with http(s)://.
    """
    # Match http(s) links, excluding any trailing ) or ] characters
    candidates = re.findall(r"https?://[^\s)\]]+", raw_key)
    if not candidates:
        return raw_key.strip()
    # Take the last match, which is usually the real URL
    return candidates[-1]
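

# Illustrative behavior of normalize_url (an informal sketch, not part of the
# original file): the markdown-damaged key "https://a.com/x](https://a.com/x)"
# matches the regex twice, and the last match "https://a.com/x" is returned;
# a key containing no URL at all is returned stripped of whitespace.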


def process_obj(agent: SearchAgent, data: Dict[str, Any], provider: str) -> Dict[str, Any]:
    """
    Process a single JSON object of the form { url_key: {"contexts": [...], ...}, ... }.
    Return an object keyed by the normalized URL, with an 'md' field added to each payload.
    """
    normalized_data: Dict[str, Any] = {}
    for raw_url, payload in tqdm(data.items()):
        url = normalize_url(raw_url)
        payload_copy: Dict[str, Any] = copy.deepcopy(payload)
        try:
            page_content = agent.scrape(url, provider=provider)
        except Exception as e:
            page_content = f"__SCRAPE_ERROR__: {e}"
        payload_copy['md'] = page_content
        normalized_data[url] = payload_copy
    return normalized_data
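

# Expected shape for the "scrape" task (illustrative values, not from the file):
# input line:  {"https://a.com/x](https://a.com/x)": {"contexts": ["claim A"]}}
# output line: {"https://a.com/x": {"contexts": ["claim A"], "md": "<scraped markdown>"}}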


def process_record_judge(agent: SearchAgent, obj: Dict[str, Any], provider: str):
    """
    Process one record of the form { raw_url: {"contexts": [...], ...} }:
    - Normalize the URL
    - Scrape the page into markdown
    - Call check_factual on each context, producing {url, context, label}
    Returns: List[Dict]
    """
    if not isinstance(obj, dict) or len(obj) != 1:
        return [{
            "__PARSE_ERROR__": "Each line must contain exactly one key (url).",
            "__raw__": obj
        }]
    (raw_url, payload), = obj.items()
    url = normalize_url(raw_url)
    contexts = []
    if isinstance(payload, dict):
        contexts = payload.get("contexts", [])
    try:
        page_content = agent.scrape(url, provider=provider)
    except Exception as e:
        page_content = f"__SCRAPE_ERROR__: {e}"
    results = []
    for c in contexts:
        if not isinstance(c, str):
            continue
        try:
            label = check_factual(c, page_content)
        except Exception as e:
            label = f"__ERROR__: {e}"
        results.append({"url": url, "context": c, "label": label})
    return results
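

# Illustrative "judge" output, one JSON object per context (a sketch; the exact
# label values come from check_factual, which per the CLI description below is
# summarized as -1/0/1):
# {"url": "https://a.com/x", "context": "claim A", "label": 1}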


def main():
    parser = argparse.ArgumentParser(description="Compare contexts with scraped pages and summarize -1/0/1.")
    parser.add_argument("--inputpath", required=True, help="Input .jsonl file path, processed line by line")
    parser.add_argument("--outputpath", required=True, help="Output file path or directory (a same-named .out.jsonl or .judge.jsonl is created inside a directory)")
    parser.add_argument("--provider", choices=["firecrawl", "jina"], default="jina", help="Scraping provider")
    parser.add_argument("--limit", type=int, default=3, help="SearchAgent.num_limit_pages")
    parser.add_argument("--task", choices=["scrape", "judge"], default="judge", help="'scrape' only scrapes and emits objects with an md field; 'judge' emits judgment results directly")
    args = parser.parse_args()

    input_abs = os.path.abspath(args.inputpath)
    output_abs = os.path.abspath(args.outputpath)
    agent = SearchAgent(num_limit_pages=args.limit)

    # Only .jsonl streaming input is supported
    if not (input_abs.lower().endswith(".jsonl") and os.path.isfile(input_abs)):
        raise ValueError("--inputpath must be an existing .jsonl file")

    # Determine the output path: given a directory, create a same-named
    # .out.jsonl or .judge.jsonl inside it
    if os.path.isdir(output_abs):
        base = os.path.splitext(os.path.basename(input_abs))[0]
        suffix = "judge.jsonl" if args.task == "judge" else "out.jsonl"
        out_jsonl = os.path.join(output_abs, f"{base}.{suffix}")
    else:
        # Treat it as a file path
        os.makedirs(os.path.dirname(output_abs) or ".", exist_ok=True)
        out_jsonl = output_abs

    count = 0
    with open(input_abs, "r", encoding="utf-8") as fin, \
         open(out_jsonl, "w", encoding="utf-8") as fout:
        for line in fin:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except Exception as e:
                fout.write(json.dumps({"__PARSE_ERROR__": str(e), "__raw__": line}, ensure_ascii=False) + "\n")
                continue
            if args.task == "scrape":
                result = process_obj(agent, obj, provider=args.provider)
                fout.write(json.dumps(result, ensure_ascii=False) + "\n")
            else:
                results = process_record_judge(agent, obj, provider=args.provider)
                for r in results:
                    fout.write(json.dumps(r, ensure_ascii=False) + "\n")
            count += 1
    print(f"Saved JSONL: {out_jsonl} (lines: {count})")


if __name__ == "__main__":
    main()
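

# Example invocation (illustrative paths; assumes the Atools package providing
# SearchAgent and check_factual is importable):
#   python judge_fact.py --inputpath data/records.jsonl --outputpath out/ \
#       --provider jina --task judge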