You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
68 lines
3.3 KiB
68 lines
3.3 KiB
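# Pre-delivery cleanup of the "major_2" field in JSONL records: keeps the rows whose major_2 matches the chosen subject's keywords and strips noise characters from the value.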
|
|
|
|
import argparse
|
|
import jsonlines
|
|
from tqdm import tqdm
|
|
import re
|
|
|
|
# Command-line options: input file path, output file path and subject (--major)
|
|
def parse_args(argv=None) -> argparse.Namespace:
    """Parse the command-line options of the major_2 cleaning script.

    Args:
        argv: Optional list of argument strings to parse. ``None`` (the
            default) makes argparse read the process arguments, which is
            what the script entry point relies on.

    Returns:
        A namespace with the string attributes ``input``, ``output`` and
        ``major``. All three options are required, so argparse exits with
        a usage message when one is missing.
    """
    parser = argparse.ArgumentParser(description="处理Major2")
    parser.add_argument("--input", required=True, help="Input JSONL file path")
    # This help text used to read "Input JSONL file path" as well.
    parser.add_argument("--output", required=True, help="Output JSONL file path")
    parser.add_argument("--major", required=True, help="输入科目")
    return parser.parse_args(argv)
|
|
|
|
|
|
# 获取输入文件的总行数(用于进度条)
|
|
def count_lines(file_path):
    """Return the number of lines in the UTF-8 text file at ``file_path``.

    The file is iterated line by line, so it is never loaded into memory
    as a whole.
    """
    line_total = 0
    with open(file_path, "r", encoding="utf-8") as handle:
        for _ in handle:
            line_total += 1
    return line_total
|
|
# Characters deleted from major_2 in a single pass: '*', '/', '\', '!' and ' '.
_MAJOR2_NOISE = str.maketrans("", "", "*/\\! ")

# Bracket characters (ASCII and CJK) deleted from major_2. Only the bracket
# characters themselves are removed; the text between them is kept.
_MAJOR2_BRACKETS = re.compile(r"[\[\]()(){}〈〉《》『』【】〖〗〘〙〚〛]")


def _strip_major2_noise(value):
    """Return ``value`` without the noise characters and bracket characters."""
    return _MAJOR2_BRACKETS.sub("", value.translate(_MAJOR2_NOISE))


def process_file(input_file, output_file,major2_keywords):
    """Filter a JSONL file by its ``major_2`` field and clean the kept values.

    Each record read from ``input_file`` is handled as follows:

    * A record whose ``major_2`` is missing, empty, not a string, or contains
      none of ``major2_keywords`` is reported on stdout and skipped, so no
      record is dropped silently.
    * Leading whitespace is removed from ``major_2``.
    * A record whose ``major_2`` contains an ASCII letter is reported on
      stdout and skipped.
    * Otherwise ``*``, ``/``, ``\\``, ``!``, spaces and bracket characters are
      removed from ``major_2`` and the record is written to ``output_file``.

    Args:
        input_file: Path of the JSONL file to read.
        output_file: Path of the JSONL file to write (opened in "w" mode).
        major2_keywords: Collection of substrings; it is iterated once per
            record, so it must be re-iterable (e.g. a list).
    """
    # The line count is only used as the progress-bar total.
    total_lines = count_lines(input_file)

    with jsonlines.open(input_file, mode="r") as reader, jsonlines.open(output_file, mode="w") as writer:
        for line in tqdm(reader, total=total_lines, desc="Processing", unit="line"):
            major2 = line.get("major_2")

            # The isinstance check keeps a non-string value (e.g. a list)
            # from crashing on .lstrip() below.
            is_valid = isinstance(major2, str) and any(
                keyword in major2 for keyword in major2_keywords
            )
            if not is_valid:
                print("major2不合法,跳过该行,内容是:"+str(line))
                continue

            # Strip leading whitespace before reporting, so the printed record
            # shows the value that was actually checked.
            major2 = major2.lstrip()
            line["major_2"] = major2

            if any('a' <= char.lower() <= 'z' for char in major2):
                print("major2中包含英文字符,请检查!内容是:"+str(line))
                continue

            line["major_2"] = _strip_major2_noise(major2)
            writer.write(line)
|
|
|
|
|
|
|
|
# Substrings that mark a major_2 value as belonging to each supported subject.
_MAJOR2_KEYWORDS_BY_MAJOR = {
    "化学": ["有机","无机","分子","化学"],
    "物理": ["天体","宇宙","行星","黑洞","原子","核","力","动力","力学","流体","光","光子","电","电子","磁","量子","超导","热","纳米","晶体","半导体","能量","相对","波动","振动","物理"],
}


def start_process_major2(input,output,major):
    """Clean the ``major_2`` field of a JSONL file for one subject.

    Looks up the keyword list for ``major``, runs ``process_file`` with it
    and prints a completion message naming ``output``.

    Args:
        input: Path of the JSONL file to read.
        output: Path of the JSONL file to write.
        major: Subject name; "化学" and "物理" are supported.

    Raises:
        SystemExit: With exit status 1 when ``major`` is not a supported
            subject. The explanatory message is printed to stdout first.
    """
    major2_keywords = _MAJOR2_KEYWORDS_BY_MAJOR.get(major)
    if major2_keywords is None:
        print("请输入正确的科目!")
        # Exit with a non-zero status so callers and shells can detect the
        # failure; the bare exit() helper reported success (status 0).
        raise SystemExit(1)

    # Pass a copy so the module-level table is never shared with callees.
    process_file(input, output, list(major2_keywords))
    print("major2清理完成,已保存到新文件:", output)
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: read --input/--output/--major from the command
    # line and run the major_2 cleanup with them.
    args = parse_args()
    start_process_major2(args.input,args.output,args.major)
|
|
|
|
|