You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
59 lines
2.0 KiB
59 lines
2.0 KiB
1 month ago
|
import json
|
||
|
import os
|
||
|
import argparse
|
||
|
from json_repair import repair_json
|
||
|
|
||
|
def is_valid_json(json_str):
    """Return whether ``json_str`` can be parsed as JSON.

    Args:
        json_str: Candidate JSON text. Anything ``json.loads`` cannot parse,
            including non-text values such as ``None``, counts as invalid.

    Returns:
        True if ``json.loads`` accepts the value, False otherwise.
    """
    try:
        json.loads(json_str)
    except (ValueError, TypeError):
        # ValueError covers json.JSONDecodeError and undecodable bytes;
        # TypeError is what json.loads raises for non-text input like None.
        return False
    return True
|
||
|
|
||
|
def process_jsonl(input_file, output_file):
    """Copy a JSONL file, repairing broken lines and numbering each record.

    Every non-blank line of ``input_file`` is parsed as JSON. A line that
    fails to parse is passed through ``repair_json`` and parsed again. Each
    record that is a JSON object receives an ``id`` field (1, 2, 3, ... in
    output order, replacing any existing ``id``) and is written as one line
    of ``output_file``. Lines that cannot be repaired, and lines whose JSON
    value is not an object, are reported on stdout and skipped.

    Args:
        input_file: Path of the JSONL file to read (opened as UTF-8).
        output_file: Path of the JSONL file to write (UTF-8, overwritten).

    Raises:
        FileNotFoundError: If ``input_file`` does not exist.
        ValueError: If ``output_file`` is the same file as ``input_file``.
    """
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"输入文件不存在: {input_file}")

    # Opening the output in "w" mode truncates it immediately, which would
    # wipe the input before a single line is read if both name one file.
    if os.path.exists(output_file) and os.path.samefile(input_file, output_file):
        raise ValueError(f"输入文件和输出文件不能相同: {input_file}")

    written = 0  # number of records written so far; also the last id used

    with open(input_file, 'r', encoding='utf-8') as infile, \
            open(output_file, 'w', encoding='utf-8') as outfile:

        for line_number, line in enumerate(infile, start=1):
            line = line.strip()
            if not line:
                continue

            # Parse once; only fall back to the repair step on failure.
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                try:
                    # An empty or still-invalid repair result makes
                    # json.loads raise, so it is reported like any other
                    # repair failure instead of being dropped silently.
                    data = json.loads(repair_json(line))
                except Exception as e:
                    print(f"第 {line_number} 行: 无法修复的JSON行: {line} | 错误: {e}")
                    continue

            # Only JSON objects can carry an "id" key; arrays, strings,
            # numbers and null would raise on item assignment.
            if not isinstance(data, dict):
                print(f"第 {line_number} 行: 不是JSON对象,已跳过: {line}")
                continue

            written += 1
            data['id'] = written

            # ensure_ascii=False keeps Chinese and other non-ASCII text as-is.
            outfile.write(json.dumps(data, ensure_ascii=False) + '\n')

    print(f"处理完成,共写入 {written} 条数据。结果保存到:{output_file}")
|
||
|
|
||
|
if __name__ == "__main__":
    # Command-line entry point: both file paths are mandatory options.
    cli = argparse.ArgumentParser(
        description="处理 JSONL 文件,添加自增 ID,并修复非法 JSON"
    )
    for flag, help_text in (
        ("--input", "输入的 JSONL 文件路径"),
        ("--output", "输出的 JSONL 文件路径"),
    ):
        cli.add_argument(flag, type=str, required=True, help=help_text)

    options = cli.parse_args()
    process_jsonl(options.input, options.output)
|