You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

100 lines
3.9 KiB

1 month ago
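# Renumbers the id field of every record in a JSONL file before delivery.
# It can also strip the leading ordinal from each question via remove_prefix,
# but that step is currently commented out in process_file.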
import argparse
import jsonlines
from tqdm import tqdm
import re
# Compiled regex matching an HTML tag: "<", one or more non-">" characters, ">".
html_tag_pattern = re.compile(r'<[^>]+>')
# Regex source (a plain string, not compiled) matching a literal backslash,
# the letter "u" and four or more hex digits, i.e. an escaped \uXXXX sequence
# left in the text. In the visible code both patterns are referenced only by
# the commented-out filter inside process_file.
unicode_pattern = r'\\u[0-9a-fA-F]{4,}'
# Command-line arguments: input/output file paths, major and starting ID
def parse_args(argv=None):
    """Parse the command-line arguments of the ID-renumbering script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, which is
            what the script entry point relies on.

    Returns:
        An ``argparse.Namespace`` with the string attributes ``input``,
        ``output`` and ``major`` (all required) and the integer attribute
        ``start_id`` (default 1).
    """
    parser = argparse.ArgumentParser(
        description="Renumber the id field of every record in a JSONL file"
    )
    parser.add_argument("--input", required=True, help="Input JSONL file path")
    parser.add_argument("--output", required=True, help="Output JSONL file path")
    parser.add_argument(
        "--major", required=True, help="Major name forwarded to the processing step"
    )
    # The help text previously advertised a default of 0 while the real
    # default is 1; keep the two in sync.
    parser.add_argument("--start_id", type=int, default=1, help="Start ID (default: 1)")
    return parser.parse_args(argv)
# Count the total number of lines in the input file (used as the progress-bar total)
def count_lines(file_path):
    """Return how many lines the UTF-8 text file at *file_path* contains."""
    line_total = 0
    with open(file_path, "r", encoding="utf-8") as handle:
        # enumerate leaves the index of the last line in line_total; an
        # empty file never enters the loop, so the initial 0 is returned.
        for line_total, _ in enumerate(handle, start=1):
            pass
    return line_total
def process_file(input_file, output_file, start_id, major):
    """Copy a JSONL file while renumbering the ``id`` field of each record.

    Records are streamed from *input_file* to *output_file* one at a time.
    The first record gets ``id = start_id`` and every following record gets
    the next integer; all other fields are written back unchanged.

    Args:
        input_file: Path of the JSONL file to read.
        output_file: Path of the JSONL file to write; it is opened in "w"
            mode.
        start_id: ID assigned to the first record.
        major: Accepted for interface compatibility but currently unused,
            because the step that stored it in each record is disabled.

    Raises:
        ValueError: If *input_file* and *output_file* resolve to the same
            path. Opening the output in "w" mode would truncate the input
            before any record is read.
    """
    import os

    if os.path.realpath(input_file) == os.path.realpath(output_file):
        raise ValueError(
            "input and output must be different files; writing to the input "
            "path would truncate it before it is read"
        )

    # Counted up front only so the progress bar can show a total.
    total_lines = count_lines(input_file)

    # NOTE: two optional steps used to run here and are currently disabled:
    #   * a filter that skipped records whose q_main starts with a single
    #     digit, whose answer_detail / std_ans / keypoint / major_2 is not a
    #     string, whose q_main / answer_detail / std_ans contains an HTML tag
    #     (html_tag_pattern), or whose major_2 contains an escaped \uXXXX
    #     sequence (unicode_pattern);
    #   * field rewriting: setting "grade" and "major", and normalising
    #     q_main / answer_detail / std_ans with remove_prefix / full_to_half.
    with jsonlines.open(input_file, mode="r") as reader, \
            jsonlines.open(output_file, mode="w") as writer:
        progress = tqdm(reader, total=total_lines, desc="Processing", unit="line")
        for new_id, line in enumerate(progress, start=start_id):
            line["id"] = new_id
            writer.write(line)
# Leading ordinal of a question. The whole alternation is anchored at the
# start of the string so that a number in the middle of the text is never
# touched. Full-width delimiters are written as \uXXXX escapes:
# \uFF0E = full-width full stop, \uFF09 = full-width right parenthesis,
# \uFF1A = full-width colon.
_ORDINAL_PREFIX_RE = re.compile(
    r'^\s*(?:'
    r'\d+\\[.\uFF0E]'            # digit(s), backslash, dot: the escaped "2\." form
    r'|\d+[.\uFF0E\\)\uFF09、]'   # digit(s) + one delimiter: . ) 、 or a lone backslash
    r'|\d+\s*题目[:\uFF1A]'       # "2 题目:"
    r'|\d+题[:\uFF1A]'            # "1题:"
    r')'
)


def remove_prefix(question):
    r"""Strip a leading ordinal number from a question string.

    Recognised prefixes (optionally preceded by whitespace), where the dot,
    right parenthesis and colon may be ASCII or full-width:

    - ``1.``  ``2.``  (also followed by spaces)
    - ``1、``
    - ``1)``
    - ``2 题目:``
    - ``1题:``
    - ``2\.`` (digit, backslash, dot)

    Only a prefix at the very start of the string is removed; the same
    pattern occurring later in the text is left alone.

    Args:
        question: The question text.

    Returns:
        The text without the ordinal prefix and without leading whitespace.
        Text that has no such prefix is returned with leading whitespace
        stripped only.
    """
    return _ORDINAL_PREFIX_RE.sub('', question, count=1).lstrip()
# Translation table built once at import time: the ideographic space
# (U+3000) maps to an ASCII space, and every full-width form in
# U+FF01..U+FF5E (punctuation, digits and letters) maps to the ASCII
# character 0xFEE0 code points below it.
_FULL_TO_HALF_TABLE = {0x3000: 0x20}
_FULL_TO_HALF_TABLE.update(
    (code, code - 0xFEE0) for code in range(0xFF01, 0xFF5F)
)


def full_to_half(text):
    """Convert full-width characters in *text* to their half-width forms.

    The full-width space (U+3000) becomes an ASCII space and the full-width
    ASCII variants U+FF01..U+FF5E become the corresponding ASCII characters.
    Every other character is left unchanged.

    Args:
        text: The string to convert.

    Returns:
        A new string with the conversions applied.
    """
    # A single str.translate pass replaces the per-character "+=" loop.
    return text.translate(_FULL_TO_HALF_TABLE)
if __name__ == "__main__":
    # Script entry point: parse the CLI arguments, renumber the records and
    # report where the result was written. The second ArgumentParser that
    # used to be built here was never used and has been removed.
    args = parse_args()
    process_file(args.input, args.output, args.start_id, args.major)
    print("ID 重置完成,已保存到新文件:", args.output)