"""Renumber the ``id`` field of every record in a JSONL file before delivery.

The script copies each record from the input file to the output file and
overwrites its ``id`` with a running counter that starts at ``--start_id``.

The text helpers ``remove_prefix`` and ``full_to_half`` are defined here but
are not applied by ``process_file`` at the moment (see its docstring).
"""
import argparse
import os
import re

# Detects HTML tags; only needed by the record filter that is currently
# disabled in process_file.
html_tag_pattern = re.compile(r'<[^>]+>')
# Detects literal "\uXXXX" escape sequences left in text; same remark as above.
unicode_pattern = r'\\u[0-9a-fA-F]{4,}'

# Leading question number, anchored at the start of the text:
#   digits followed by '.', full-width '.' (U+FF0E), ')', full-width ')'
#   (U+FF09) or the ideographic comma (U+3001), optionally preceded by a
#   backslash (the "2\." form); or digits followed by a lone backslash;
#   or digits + "题目"/"题" followed by an ASCII or full-width (U+FF1A) colon.
_PREFIX_PATTERN = re.compile(
    r'^\s*\d+(?:\\?[.\uFF0E)\uFF09\u3001]|\\|\s*题目[:\uFF1A]|题[:\uFF1A])'
)

# Translation table: ideographic space -> ASCII space, and the full-width
# ASCII block U+FF01..U+FF5E -> the matching half-width character.
_FULL_TO_HALF_TABLE = {0x3000: 0x20}
_FULL_TO_HALF_TABLE.update(
    {code: code - 0xFEE0 for code in range(0xFF01, 0xFF5F)}
)


def parse_args(argv=None):
    """Parse the command-line arguments.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``.

    Returns:
        An ``argparse.Namespace`` with ``input``, ``output``, ``major`` and
        ``start_id`` attributes.
    """
    parser = argparse.ArgumentParser(
        description="Renumber the id field of every record in a JSONL file"
    )
    parser.add_argument("--input", required=True, help="Input JSONL file path")
    parser.add_argument("--output", required=True, help="Output JSONL file path")
    parser.add_argument(
        "--major",
        required=True,
        help="Major name (passed to process_file, which currently ignores it)",
    )
    parser.add_argument(
        "--start_id", type=int, default=1, help="Start ID (default: 1)"
    )
    return parser.parse_args(argv)


def count_lines(file_path) -> int:
    """Return the number of lines in *file_path* (used as the progress total)."""
    with open(file_path, "r", encoding="utf-8") as f:
        return sum(1 for _ in f)


def process_file(input_file, output_file, start_id, major):
    """Copy *input_file* to *output_file*, renumbering each record's ``id``.

    Records are read one at a time, their ``id`` key is set to a counter that
    starts at *start_id* and increases by one per record, and the record is
    written out immediately.

    NOTE: an earlier version also (a) skipped records whose ``q_main`` starts
    with a single digit, whose ``answer_detail`` / ``std_ans`` / ``keypoint`` /
    ``major_2`` are not strings, or that contain HTML tags or literal
    ``\\uXXXX`` escapes, and (b) set ``grade`` / ``major`` and normalised
    ``q_main`` / ``answer_detail`` / ``std_ans`` with ``remove_prefix`` and
    ``full_to_half``. Those steps were commented out and are not performed.

    Args:
        input_file: Path of the JSONL file to read.
        output_file: Path of the JSONL file to write (overwritten).
        start_id: First id to assign.
        major: Currently unused; kept so existing callers keep working.

    Raises:
        ValueError: If *input_file* and *output_file* are the same path.
    """
    # Imported here rather than at module level so that the pure-text helpers
    # in this module can be imported without the third-party packages.
    import jsonlines
    from tqdm import tqdm

    # Opening the output in "w" mode truncates it; if it is the input file the
    # data would be destroyed before it is read.
    if os.path.realpath(input_file) == os.path.realpath(output_file):
        raise ValueError("input and output must be different files")

    total_lines = count_lines(input_file)

    with jsonlines.open(input_file, mode="r") as reader, \
            jsonlines.open(output_file, mode="w") as writer:
        records = tqdm(reader, total=total_lines, desc="Processing", unit="line")
        for new_id, record in enumerate(records, start=start_id):
            record["id"] = new_id
            writer.write(record)


def remove_prefix(question: str) -> str:
    r"""Strip a leading question number from *question*.

    Recognised prefixes (optionally preceded by whitespace), where the
    punctuation may be ASCII or full-width:

    - ``1.``  ``2. ``
    - ``1、``
    - ``1)``
    - ``2 题目:``
    - ``1题:``
    - ``2\.`` (escaped dot) and ``2\``

    Only a prefix at the very start of the text is removed; the same pattern
    occurring later in the text is left alone. Leading whitespace of the
    result is always stripped.
    """
    return _PREFIX_PATTERN.sub('', question, count=1).lstrip()


def full_to_half(text: str) -> str:
    """Convert full-width characters in *text* to their half-width form.

    The ideographic space (U+3000) becomes an ASCII space and characters in
    U+FF01..U+FF5E are shifted down to the ASCII range; everything else is
    left unchanged.
    """
    return text.translate(_FULL_TO_HALF_TABLE)


def main():
    """Script entry point: parse arguments, renumber ids, report the output."""
    args = parse_args()
    process_file(args.input, args.output, args.start_id, args.major)
    print("ID 重置完成,已保存到新文件:", args.output)


if __name__ == "__main__":
    main()