You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

59 lines
2.0 KiB

1 month ago
import json
import os
import argparse
from json_repair import repair_json
def is_valid_json(json_str):
    """Return True if ``json_str`` parses as JSON, False otherwise.

    Args:
        json_str: Candidate JSON text (``str``, ``bytes`` or ``bytearray``).

    Returns:
        bool: True when ``json.loads`` accepts the value. Values that are
        not text at all (for example ``None``) and byte strings that cannot
        be decoded are reported as invalid instead of raising.
    """
    try:
        json.loads(json_str)
    except (TypeError, ValueError):
        # TypeError: the value is not str/bytes/bytearray (e.g. None).
        # ValueError: covers json.JSONDecodeError and UnicodeDecodeError.
        return False
    return True
def process_jsonl(input_file, output_file):
    """Copy a JSONL file to ``output_file``, repairing malformed lines.

    Blank lines are dropped. Each remaining line is parsed as JSON; if that
    fails, ``repair_json`` is asked to fix it. Lines that still cannot be
    parsed are reported on stdout and skipped. Every parsed record is
    written back as one JSON document per line with non-ASCII characters
    kept as-is. Records are written unchanged; no ``id`` field is added.

    Args:
        input_file: Path of the JSONL file to read (UTF-8).
        output_file: Path of the JSONL file to write (UTF-8, overwritten).

    Raises:
        FileNotFoundError: If ``input_file`` does not exist.
        ValueError: If ``input_file`` and ``output_file`` are the same file.
    """
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"输入文件不存在: {input_file}")
    # Opening the output in 'w' mode truncates it immediately, so writing
    # onto the input would wipe the data before a single line is read.
    if os.path.exists(output_file) and os.path.samefile(input_file, output_file):
        raise ValueError(f"输入文件和输出文件不能是同一个文件: {input_file}")

    with open(input_file, 'r', encoding='utf-8') as infile, \
            open(output_file, 'w', encoding='utf-8') as outfile:
        for line_number, line in enumerate(infile, start=1):
            line = line.strip()
            if not line:
                continue

            try:
                # Parse once and keep the result instead of validating first
                # and then parsing the same text a second time.
                data = json.loads(line)
            except json.JSONDecodeError:
                try:
                    repaired = repair_json(line)
                    if not repaired:
                        # Nothing usable came back; report it like any other
                        # unrepairable line rather than dropping it silently.
                        print(f"{line_number} 行: 无法修复的JSON行: {line} | 错误: 修复结果为空")
                        continue
                    data = json.loads(repaired)
                except Exception as e:
                    # Best effort: the repair library is third-party, so any
                    # failure here only costs this one line, not the whole run.
                    print(f"{line_number} 行: 无法修复的JSON行: {line} | 错误: {e}")
                    continue

            # ensure_ascii=False keeps Chinese and other non-ASCII text readable.
            outfile.write(json.dumps(data, ensure_ascii=False) + '\n')
def main(argv=None):
    """Parse command-line arguments and run the JSONL clean-up.

    Args:
        argv: Argument strings to parse. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``.
    """
    # The help text describes what process_jsonl actually does: it repairs or
    # skips bad lines and does not add an id field.
    parser = argparse.ArgumentParser(description="处理 JSONL 文件,修复非法 JSON,跳过无法修复的行")
    parser.add_argument("--input", type=str, required=True, help="输入的 JSONL 文件路径")
    parser.add_argument("--output", type=str, required=True, help="输出的 JSONL 文件路径")
    args = parser.parse_args(argv)
    process_jsonl(args.input, args.output)


if __name__ == "__main__":
    main()