"""Clean a JSONL file by re-serialising each line, repairing broken JSON where possible."""
import argparse
import json
import os

from json_repair import repair_json


def is_valid_json(json_str):
    """Return True if *json_str* parses as JSON, False otherwise."""
    try:
        json.loads(json_str)
    except json.JSONDecodeError:
        return False
    return True


def _repair_line(line):
    """Return the value parsed from *line* after passing it through ``repair_json``.

    Raises:
        ValueError: if ``repair_json`` yields an empty result, or (as
            ``json.JSONDecodeError``) if its output still does not parse.
    """
    repaired = repair_json(line)
    if not repaired:
        raise ValueError("修复结果为空")
    return json.loads(repaired)


def process_jsonl(input_file, output_file):
    """Copy *input_file* to *output_file*, one normalised JSON document per line.

    Blank lines are skipped. A line that already parses is re-serialised
    as-is; a line that does not parse is passed through ``repair_json``
    first. Lines that still cannot be parsed are reported on stdout and
    left out of the output.

    Args:
        input_file: Path of the JSONL file to read (UTF-8).
        output_file: Path of the JSONL file to write (UTF-8, overwritten).

    Returns:
        The number of records written to *output_file*.

    Raises:
        FileNotFoundError: if *input_file* does not exist.
    """
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"输入文件不存在: {input_file}")

    written = 0
    with open(input_file, 'r', encoding='utf-8') as infile, \
            open(output_file, 'w', encoding='utf-8') as outfile:
        for line_number, raw_line in enumerate(infile, start=1):
            line = raw_line.strip()
            if not line:
                continue

            # Parse once; only fall back to repair when the line is invalid.
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                try:
                    data = _repair_line(line)
                except Exception as e:
                    # Best-effort by design: any failure during repair means
                    # this line is skipped (and reported), not that the whole
                    # run aborts.
                    print(f"第 {line_number} 行: 无法修复的JSON行: {line} | 错误: {e}")
                    continue

            # ensure_ascii=False keeps Chinese and other non-ASCII text readable.
            outfile.write(json.dumps(data, ensure_ascii=False) + '\n')
            written += 1

    return written


def main():
    """Parse the command-line arguments and run ``process_jsonl``."""
    parser = argparse.ArgumentParser(description="处理 JSONL 文件,并修复非法 JSON")
    parser.add_argument("--input", type=str, required=True, help="输入的 JSONL 文件路径")
    parser.add_argument("--output", type=str, required=True, help="输出的 JSONL 文件路径")
    args = parser.parse_args()
    process_jsonl(args.input, args.output)


if __name__ == "__main__":
    main()