"""Filter and clean the ``major_2`` field of a JSONL file before delivery.

Each record of the input file is kept only when its ``major_2`` value matches
the keyword list of the selected subject and contains no English letters; the
kept records are written to the output file with noise symbols and bracket
characters stripped from ``major_2``.
"""
import argparse
import re
import sys

import jsonlines
from tqdm import tqdm

# Keywords a ``major_2`` value must contain (substring match), per subject.
_MAJOR2_KEYWORDS = {
    # chemistry: organic, inorganic, molecule, chemistry
    "化学": ("有机", "无机", "分子", "化学"),
    # physics: celestial body, universe, planet, black hole, atom, nucleus,
    # force, dynamics, mechanics, fluid, light, photon, electricity, electron,
    # magnetism, quantum, superconductivity, heat, nano, crystal,
    # semiconductor, energy, relativity, wave, vibration, physics
    "物理": (
        "天体", "宇宙", "行星", "黑洞", "原子", "核", "力", "动力", "力学",
        "流体", "光", "光子", "电", "电子", "磁", "量子", "超导", "热", "纳米",
        "晶体", "半导体", "能量", "相对", "波动", "振动", "物理",
    ),
}

# Deletes the symbols *, /, \, ! and the ASCII space in a single pass.
_SYMBOL_TABLE = str.maketrans("", "", "*/\\! ")

# Bracket characters to delete. The previous pattern had lost its escapes
# (``$$|$$|(|)|$|$|...``) and therefore only matched empty strings for the
# square and round brackets, leaving them in the output.
_BRACKET_RE = re.compile(r"[\[\]()（）{}〈〉《》『』【】〖〗〘〙〚〛]")


def parse_args():
    """Parse the command-line arguments.

    Returns:
        argparse.Namespace with ``input``, ``output`` and ``major``, all
        required.
    """
    parser = argparse.ArgumentParser(description="处理Major2")
    parser.add_argument("--input", required=True, help="Input JSONL file path")
    parser.add_argument("--output", required=True, help="Output JSONL file path")
    parser.add_argument("--major", required=True, help="输入科目")
    return parser.parse_args()


def count_lines(file_path):
    """Return the number of lines in ``file_path`` (used as progress-bar total)."""
    with open(file_path, "r", encoding="utf-8") as f:
        return sum(1 for _ in f)


def _clean_major2(value):
    """Return ``value`` without noise symbols, spaces and bracket characters.

    Only the bracket characters themselves are removed; any text between them
    is kept.
    """
    return _BRACKET_RE.sub("", value.translate(_SYMBOL_TABLE))


def process_file(input_file, output_file, major2_keywords):
    """Copy the valid records of ``input_file`` to ``output_file``.

    For every record read from the input JSONL file:

    * a missing or empty ``major_2`` drops the record silently;
    * a ``major_2`` that is not a string, or that contains none of
      ``major2_keywords``, drops the record and prints a message;
    * a ``major_2`` that contains an English letter (after leading whitespace
      is stripped) drops the record and prints a message;
    * otherwise ``major_2`` is cleaned and the record is written out.

    Args:
        input_file: Path of the JSONL file to read.
        output_file: Path of the JSONL file to write (overwritten).
        major2_keywords: Iterable of substrings; ``major_2`` must contain at
            least one of them to be kept.
    """
    total_lines = count_lines(input_file)

    with jsonlines.open(input_file, mode="r") as reader, \
            jsonlines.open(output_file, mode="w") as writer:
        # Wrap the reader in tqdm so a progress bar is shown.
        for line in tqdm(reader, total=total_lines, desc="Processing", unit="line"):
            value = line.get("major_2")
            if not value:
                continue

            # A non-string value cannot be matched or cleaned; treat it as
            # invalid instead of crashing halfway through the file.
            if not isinstance(value, str) or not any(
                keyword in value for keyword in major2_keywords
            ):
                print("major2不合法,跳过该行,内容是:"+str(line))
                continue

            # Strip leading whitespace before the checks below; the record is
            # updated first so the diagnostic shows the stripped value.
            value = value.lstrip()
            line["major_2"] = value

            if any('a' <= char.lower() <= 'z' for char in value):
                print("major2中包含英文字符,请检查!内容是:"+str(line))
                continue

            line["major_2"] = _clean_major2(value)
            writer.write(line)


def start_process_major2(input, output, major):
    """Run the ``major_2`` clean-up for the given subject.

    Args:
        input: Path of the JSONL file to read.
        output: Path of the JSONL file to write.
        major: Subject name; must be ``"化学"`` or ``"物理"``.

    Raises:
        SystemExit: With status 1 when ``major`` is not a supported subject.
    """
    major2_keywords = _MAJOR2_KEYWORDS.get(major)
    if major2_keywords is None:
        print("请输入正确的科目!")
        # Non-zero status so calling scripts can detect the failure.
        sys.exit(1)

    process_file(input, output, major2_keywords)
    print("major2清理完成,已保存到新文件:", output)


if __name__ == "__main__":
    args = parse_args()
    start_process_major2(args.input, args.output, args.major)