You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
100 lines
3.9 KiB
100 lines
3.9 KiB
1 month ago
|
# Resets (renumbers) the id field of every record in a JSONL file before delivery.
|
||
|
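# Also intended to clean each record by stripping the numbering prefix from the question text; that step is currently commented out in process_file.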
|
||
|
|
||
|
import argparse
|
||
|
import jsonlines
|
||
|
from tqdm import tqdm
|
||
|
import re
|
||
|
|
||
|
# Compiled regex that matches any HTML tag, e.g. "<br>" or "</div>".
html_tag_pattern = re.compile(r'<[^>]+>')

# Regex source matching a literal backslash-u escape followed by 4+ hex digits.
unicode_pattern = r'\\u[0-9a-fA-F]{4,}'
|
||
|
# 输入文件路径和输出文件路径
|
||
|
def parse_args():
    """Parse the command-line options of the ID-reset script.

    Returns:
        argparse.Namespace with the attributes ``input``, ``output`` and
        ``major`` (required strings) and ``start_id`` (int, default 1).
    """
    parser = argparse.ArgumentParser(
        description="Reassign sequential IDs to the records of a JSONL file"
    )
    parser.add_argument("--input", required=True, help="Input JSONL file path")
    parser.add_argument("--output", required=True, help="Output JSONL file path")
    # Still required so existing invocations keep working, although
    # process_file does not currently write this value into the records.
    parser.add_argument(
        "--major",
        required=True,
        help="Major value for the records (currently not written to the output)",
    )
    parser.add_argument(
        "--start_id",
        type=int,
        default=1,
        help="ID assigned to the first record (default: 1)",
    )
    return parser.parse_args()
|
||
|
|
||
|
|
||
|
# 获取输入文件的总行数(用于进度条)
|
||
|
def count_lines(file_path):
    """Return how many lines the UTF-8 text file at *file_path* contains.

    The result is used as the ``total`` of the progress bar.
    """
    line_count = 0
    with open(file_path, "r", encoding="utf-8") as handle:
        # enumerate leaves the index of the last line in line_count;
        # an empty file never enters the loop and keeps the initial 0.
        for line_count, _ in enumerate(handle, start=1):
            pass
    return line_count
|
||
|
def process_file(input_file, output_file, start_id, major):
    """Copy a JSONL file while renumbering the ``id`` field of every record.

    Records are read one by one from *input_file*, their ``id`` is
    overwritten with consecutive integers beginning at *start_id*, and the
    result is written to *output_file*. A tqdm progress bar reports progress.

    NOTE: no record filtering and no text normalization is applied here;
    remove_prefix and full_to_half are not called, so every input record is
    written to the output with only its ``id`` changed.

    Args:
        input_file: Path of the JSONL file to read.
        output_file: Path of the JSONL file to write (opened in "w" mode).
        start_id: ID given to the first record.
        major: Accepted for interface compatibility; currently unused.

    Raises:
        ValueError: If *input_file* and *output_file* resolve to the same
            path; opening the output for writing would truncate the input
            before it has been read.
    """
    import os  # local import: only needed for the path guard below

    if os.path.realpath(input_file) == os.path.realpath(output_file):
        raise ValueError(
            "input and output must be different files; writing would "
            "truncate the input before it is read"
        )

    # Count the lines first so the progress bar knows its total.
    total_lines = count_lines(input_file)

    with jsonlines.open(input_file, mode="r") as reader, \
            jsonlines.open(output_file, mode="w") as writer:
        progress = tqdm(reader, total=total_lines, desc="Processing", unit="line")
        # enumerate supplies the consecutive IDs, starting at start_id.
        for new_id, line in enumerate(progress, start=start_id):
            line["id"] = new_id
            writer.write(line)
|
||
|
|
||
|
# A numbering prefix: optional whitespace, digits, then one marker.
# Every alternative sits inside one group anchored at the start of the
# string, so text such as "3题:" in the middle of a question is left alone.
_PREFIX_PATTERN = re.compile(r'^\s*\d+(?:\\[\..]?|[\..))、]|\s*题目:|题:)')


def remove_prefix(question):
    r"""Strip a leading numbering prefix from *question*.

    Recognized prefixes (digits followed by a marker, optionally preceded
    by whitespace):
        - "1." / "1. " (ASCII dot) and "1." (full-width dot)
        - "1、"
        - "1)" and "1)"
        - "2 题目:"
        - "1题:"
        - "2\." (an escaped dot; a bare "2\" is stripped as well)

    Only a prefix at the very start of the string is removed; the same
    character sequences appearing later in the text are kept. Note that a
    leading decimal such as "3.14" also matches the "digits + dot" form.

    Args:
        question: The question text.

    Returns:
        The text without the prefix and without leading whitespace.
    """
    return _PREFIX_PATTERN.sub('', question).lstrip()
|
||
|
def full_to_half(text):
    """Convert the full-width characters in *text* to half-width.

    The ideographic space (U+3000) becomes an ASCII space. Every character
    in U+FF01..U+FF5E (the full-width forms of ASCII punctuation, digits
    and letters) is shifted down by 0xFEE0 to its ASCII counterpart. All
    other characters are returned unchanged.

    Args:
        text: The string to convert.

    Returns:
        The converted string.
    """
    def _to_half(char):
        code = ord(char)
        if code == 0x3000:  # ideographic (full-width) space
            return " "
        if 0xFF01 <= code <= 0xFF5E:  # full-width ASCII variants
            return chr(code - 0xFEE0)
        return char

    # join builds the result in one pass instead of repeated += in a loop.
    return "".join(_to_half(char) for char in text)
|
||
|
|
||
|
|
||
|
if __name__ == "__main__":
    # Script entry point: parse the CLI options, renumber the records of the
    # input file into the output file, then report where the result was saved.
    args = parse_args()
    process_file(args.input, args.output, args.start_id, args.major)
    print("ID 重置完成,已保存到新文件:", args.output)
|
||
|
|