You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
135 lines
4.2 KiB
135 lines
4.2 KiB
1 month ago
|
|
||
|
# Start
|
||
|
import json
|
||
|
import jsonlines
|
||
|
|
||
|
|
||
|
def load_ori_file(ori_path, question_type_list):
    """Load raw question records from a JSON Lines file.

    Every line of ``ori_path`` is stripped and parsed as one JSON value.
    Lines that are not valid JSON are reported on stdout and skipped.  A
    parsed record is kept only when ``ori_data_validate`` accepts it.

    Args:
        ori_path: Path of the UTF-8 encoded JSON Lines file to read.
        question_type_list: Currently unused.  The parameter is kept for
            interface compatibility; filtering by question type is disabled.

    Returns:
        A list with the parsed records that passed validation, in file
        order.
    """
    ori_data_list = []
    line_count = 0
    with open(ori_path, 'r', encoding='utf-8') as f:
        # enumerate() is the only place the line number is advanced, so a
        # malformed line is counted exactly once and the number printed in
        # the error message is the real 1-based line number.
        for line_count, line in enumerate(f, start=1):
            stripped = line.strip()
            try:
                data = json.loads(stripped)
            except json.JSONDecodeError:
                print(f"Error decoding JSON on line {line_count}: {stripped}")
                continue
            # Preliminary validation of the parsed record.
            if ori_data_validate(data):
                ori_data_list.append(data)
    print(f"Total lines processed: {line_count}")
    return ori_data_list
|
||
|
|
||
|
|
||
|
# Validates a raw question record before it is processed further.
def ori_data_validate(ori_data_str):
    """Return True unless the record refers to a figure ("如图").

    "如图" means "as shown in the figure".  A plain string is checked
    directly.  A dict, list or tuple (for example a record returned by
    ``json.loads``) is searched recursively, so the marker is found inside
    string values and not only among the top-level dict keys.

    Args:
        ori_data_str: The raw record, either as text or as a parsed JSON
            structure.

    Returns:
        False when the marker is found, or when the value cannot be
        checked at all (the error is printed); True otherwise.
    """

    def _mentions_figure(value):
        # Recursively look for the marker in every string of the structure.
        if isinstance(value, str):
            return "如图" in value
        if isinstance(value, dict):
            return any(
                _mentions_figure(key) or _mentions_figure(item)
                for key, item in value.items()
            )
        if isinstance(value, (list, tuple)):
            return any(_mentions_figure(item) for item in value)
        return False

    try:
        if isinstance(ori_data_str, (str, dict, list, tuple)):
            return not _mentions_figure(ori_data_str)
        # Any other type falls back to a plain membership test.  It raises
        # TypeError for non-containers such as None or numbers, which are
        # rejected below.
        return "如图" not in ori_data_str
    except Exception as e:
        print(e)
        return False
|
||
|
|
||
|
# Decides whether an answer counts as a "simple" answer.
def is_easy_answer(s):
    """Return True when ``s`` is short and contains a Chinese character.

    "Short" means at most 5 characters.  A Chinese character is any code
    point in the range U+4E00..U+9FA5.

    Args:
        s: The answer text to inspect.

    Returns:
        True if both conditions hold, False otherwise.
    """
    import re

    is_short = len(s) <= 5
    # The regex is only evaluated for short inputs, as before.
    return is_short and re.search(r'[\u4e00-\u9fa5]', s) is not None
|
||
|
|
||
|
|
||
|
def generate_processed_data(ori_data_list, major, output_path):
    """Convert raw question records to the unified schema and save them.

    For every record the 'difficulty' value is normalized to one of the
    canonical labels '难' (hard), '中' (medium) or '低' (easy).  Records
    whose difficulty is not recognized, or that lack one of the required
    fields ('difficulty', 'subject', 'question', 'answer', 'analyzing',
    'knowledge_point', 'type'), are skipped.  Accepted records receive
    consecutive ids starting at 1, are passed through ``clean_data`` and
    are written to ``output_path`` as JSON Lines.

    The input records are not modified, so the same list can be processed
    more than once with the same result.

    Args:
        ori_data_list: Iterable of raw records (dicts).
        major: Value stored in the "major" field of every output record.
        output_path: Destination path of the .jsonl file.

    Returns:
        None.  The result is written to ``output_path``; progress messages
        are printed to stdout.
    """
    # Raw difficulty label -> canonical label.  Keys are lower-case and
    # free of surrounding whitespace; the lookup key is normalized the same
    # way.  The canonical labels map to themselves.
    difficulty_map = {
        '难': '难',
        '困难': '难',
        '高': '难',
        'high': '难',
        '5': '难',
        '5星': '难',
        'hard': '难',
        '中': '中',
        'medium': '中',
        '4': '中',
        '3': '中',
        '4星': '中',
        '3星': '中',
        '低': '低',
        'low': '低',
        '1': '低',
        '1星': '低',
        '2': '低',
        '2星': '低',
        'easy': '低',
    }

    processed_data_list = []
    skipped_count = 0
    next_id = 1

    # "major" is supplied by the caller; "major_2" is taken from the
    # record's 'subject' field.
    for ori_item in ori_data_list:
        try:
            # str() lets numeric difficulties such as 5 match the '5' key;
            # strip()/lower() tolerate padding and case differences.
            difficulty_key = str(ori_item['difficulty']).strip().lower()
            hard_level = difficulty_map.get(difficulty_key)
            if hard_level is None:
                # Unknown difficulty label: skip the record.
                skipped_count += 1
                continue
            processed_item = {
                "grade_class": "高等教育",
                "grade": "大学",
                "major": major,
                "major_2": ori_item['subject'],
                "language": "zh",
                "id": next_id,
                "q_main": ori_item['question'],
                "std_ans": ori_item['answer'],
                "answer_detail": ori_item['analyzing'],
                "hard_level": hard_level,
                "keypoint": ori_item['knowledge_point'],
                "q_type": ori_item['type'],
            }
        except (KeyError, TypeError):
            # Missing field, or the record is not a mapping at all.
            skipped_count += 1
            continue
        # Clean every string value of the record before it is stored.
        processed_data_list.append(clean_data(processed_item))
        next_id += 1

    # Save the list as a .jsonl file; this is the earliest filtered and
    # format-unified result of the pipeline.
    print(f"Total valid JSON objects: {len(processed_data_list)}")
    print(f"Total skipped items: {skipped_count}")
    print("正在写入处理后的文件,请稍等...")
    with jsonlines.open(output_path, mode='w') as writer:
        writer.write_all(processed_data_list)
    print("写入完成!")
|
||
|
|
||
|
|
||
|
|
||
|
def clean_text(text):
    """Replace characters of a string that cannot be encoded as UTF-8.

    Args:
        text: Any value.  Only ``str`` instances are processed.

    Returns:
        For a string, the result of encoding it to UTF-8 with
        ``errors='replace'`` and decoding it again, which substitutes
        unencodable characters such as lone surrogates.  Any other value
        is returned unchanged.
    """
    if not isinstance(text, str):
        return text
    # Round-trip through UTF-8 so that lone surrogate code points are
    # replaced instead of raising later when the text is written out.
    utf8_bytes = text.encode('utf-8', errors='replace')
    return utf8_bytes.decode('utf-8')
|
||
|
|
||
|
def clean_data(data):
    """Recursively apply ``clean_text`` to every string in a structure.

    Args:
        data: A string, a list, a dict, or any other value.  Lists and
            dict values are traversed recursively; dict keys are kept as
            they are.

    Returns:
        A new dict or list with cleaned contents, the cleaned string, or
        the value itself when it is of any other type.
    """
    if isinstance(data, str):
        return clean_text(data)
    if isinstance(data, list):
        return [clean_data(element) for element in data]
    if isinstance(data, dict):
        cleaned = {}
        for field, value in data.items():
            cleaned[field] = clean_data(value)
        return cleaned
    return data
|