# NOTE: repository web-page residue (topic-selection hint, "68 lines", "3.3 KiB") was removed here; it was not part of the script.

# Pre-delivery cleanup script: filters JSONL records by their "major_2" field and normalizes its text.
import argparse
import jsonlines
from tqdm import tqdm
import re
# Command-line options: input path, output path and the subject to filter by.
def parse_args(argv=None):
    """Parse the command-line options of the major_2 cleanup script.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse reads ``sys.argv[1:]``, which is the original behavior.

    Returns:
        An ``argparse.Namespace`` with the required attributes ``input``,
        ``output`` and ``major``.
    """
    parser = argparse.ArgumentParser(description="处理Major2")
    parser.add_argument("--input", required=True, help="Input JSONL file path")
    # The help text previously said "Input" for this option as well.
    parser.add_argument("--output", required=True, help="Output JSONL file path")
    parser.add_argument("--major", required=True, help="输入科目")
    return parser.parse_args(argv)
# Total number of lines in the input file (used as the progress-bar total).
def count_lines(file_path):
    """Return how many lines the UTF-8 text file at *file_path* contains."""
    line_total = 0
    with open(file_path, "r", encoding="utf-8") as handle:
        for _ in handle:
            line_total += 1
    return line_total
# Characters deleted from "major_2": the symbols * / \ ! and the space, plus
# bracket characters. The original bracket regex was garbled ("$$|$$", "$|$"
# and an empty group "(|)"), so it never stripped square or round brackets.
# NOTE(review): the full-width parentheses are included on the assumption that
# the unescaped "(|)" alternative originally held that pair - confirm.
_MAJOR2_DELETE_TABLE = str.maketrans(
    "",
    "",
    "*/\\! " "[]()（）{}〈〉《》『』【】〖〗〘〙〚〛",
)


def _has_ascii_letter(text):
    """Return True when *text* contains a character whose lowercase form is a-z."""
    return any("a" <= char.lower() <= "z" for char in text)


def process_file(input_file, output_file, major2_keywords):
    """Copy records whose "major_2" passes the keyword filter, cleaning the field.

    Each record read from ``input_file`` is handled as follows:

    * If "major_2" is missing, empty, not a string, or contains none of the
      keywords, the record is reported as invalid and skipped.
    * Leading whitespace is stripped from "major_2"; if the value then
      contains an English letter the record is reported and skipped.
    * Otherwise the symbols ``* / \\ !``, spaces and bracket characters are
      deleted from "major_2" and the record is written to ``output_file``.
      Only the bracket characters are removed; the text between them is kept.

    Args:
        input_file: Path of the JSONL file to read.
        output_file: Path of the JSONL file to write.
        major2_keywords: Iterable of substrings; a record is kept only when
            its "major_2" contains at least one of them.
    """
    # An empty keyword is a substring of every string and would silently
    # disable the filter, so such entries are ignored.
    keywords = [keyword for keyword in major2_keywords if keyword]
    total_lines = count_lines(input_file)

    with jsonlines.open(input_file, mode="r") as reader, \
            jsonlines.open(output_file, mode="w") as writer:
        for record in tqdm(reader, total=total_lines, desc="Processing", unit="line"):
            major2 = record.get("major_2")
            # Non-string values cannot be cleaned and are treated as invalid
            # instead of crashing on .lstrip().
            if not isinstance(major2, str) or not any(kw in major2 for kw in keywords):
                print("major2不合法,跳过该行,内容是:"+str(record))
                continue

            # Stored before the English check so the reported record shows
            # the stripped value, as the original code did.
            record["major_2"] = major2.lstrip()
            if _has_ascii_letter(record["major_2"]):
                print("major2中包含英文字符,请检查!内容是:"+str(record))
                continue

            record["major_2"] = record["major_2"].translate(_MAJOR2_DELETE_TABLE)
            writer.write(record)
def start_process_major2(input, output, major):
    """Pick the keyword list for *major* and run the major_2 cleanup.

    Args:
        input: Path of the JSONL file to read.
        output: Path of the JSONL file to write.
        major: Subject name; "化学" and "物理" are supported.

    Raises:
        SystemExit: With exit status 1 when *major* is not a supported
            subject (after printing a hint).
    """
    keywords_by_major = {
        # Chemistry: organic, inorganic, molecular, chemistry.
        "化学": ["有机", "无机", "分子", "化学"],
        # Physics. The single-character entries 核, 力, 光, 电, 磁 and 热 were
        # empty strings in the list while still being named, in the same
        # positions, by the comment above it. An empty string is contained in
        # every value, so it made every record pass the filter.
        # NOTE(review): restored from that comment - confirm they were not
        # blanked out on purpose for being too broad.
        "物理": [
            "天体", "宇宙", "行星", "黑洞", "原子", "核", "力", "动力", "力学",
            "流体", "光", "光子", "电", "电子", "磁", "量子", "超导", "热",
            "纳米", "晶体", "半导体", "能量", "相对", "波动", "振动", "物理",
        ],
    }

    major2_keywords = keywords_by_major.get(major)
    if major2_keywords is None:
        print("请输入正确的科目!")
        # Exit with a failure status; the bare exit() used before reported
        # success (status 0) and is only a helper provided by the site module.
        raise SystemExit(1)

    process_file(input, output, major2_keywords)
    print("major2清理完成,已保存到新文件:", output)
if __name__ == "__main__":
    # Script entry point: read the CLI options and run the cleanup.
    # An unused second ArgumentParser with an unrelated description was
    # created here before and has been dropped.
    args = parse_args()
    start_process_major2(args.input, args.output, args.major)