You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 

135 lines
4.2 KiB

# Start
import json
import jsonlines
def load_ori_file(ori_path, question_type_list):
    """Load raw question records from a JSON Lines file.

    Each non-blank line of ``ori_path`` is parsed as one JSON value; values
    accepted by ``ori_data_validate`` are collected and returned.

    Args:
        ori_path: Path of the UTF-8 encoded JSON Lines file to read.
        question_type_list: Currently unused. The question-type filter that
            consumed it is disabled; the parameter is kept so existing
            callers keep working.

    Returns:
        A list of the parsed records that passed validation, in file order.
    """
    ori_data_list = []
    # Stays 0 when the file is empty, so the summary below is always defined.
    line_count = 0
    with open(ori_path, 'r', encoding='utf-8') as f:
        # enumerate() is the single source of truth for the line number, so
        # a malformed line can no longer be counted twice.
        for line_count, line in enumerate(f, start=1):
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline) are not JSON errors.
                continue
            try:
                data = json.loads(stripped)
            except json.JSONDecodeError:
                print(f"Error decoding JSON on line {line_count}: {stripped}")
                continue
            # Preliminary validation of the parsed record.
            if ori_data_validate(data):
                ori_data_list.append(data)
    print(f"Total lines processed: {line_count}")
    return ori_data_list
def ori_data_validate(ori_data_str):
    """Check whether a raw question record is usable.

    A record is rejected when its text contains "如图" ("as shown in the
    figure"), since such a question depends on a picture.

    Args:
        ori_data_str: The record, either as raw text or as an already parsed
            JSON container (dict or list).

    Returns:
        True when the record does not mention "如图"; False when it does,
        or when the record is neither text nor a container.
    """
    if isinstance(ori_data_str, str):
        text = ori_data_str
    elif isinstance(ori_data_str, (dict, list, tuple)):
        # A plain `in` test on a dict only inspects its keys, so the marker
        # inside a question body would be missed. Serialise first;
        # ensure_ascii=False keeps Chinese characters literal so the
        # substring search below can match them.
        try:
            text = json.dumps(ori_data_str, ensure_ascii=False)
        except (TypeError, ValueError) as e:
            print(e)
            return False
    else:
        # Scalars (numbers, None, ...) are not question records.
        print(f"Unsupported record type: {type(ori_data_str).__name__}")
        return False
    return "如图" not in text
def is_easy_answer(s):
    """Tell whether an answer is a "simple" one.

    An answer counts as simple when it is at most 5 characters long and
    contains at least one character in the range U+4E00..U+9FA5.

    Args:
        s: The answer text.

    Returns:
        True for a simple answer, False otherwise.
    """
    # Imported locally because the module does not import `re` at the top.
    import re

    return len(s) <= 5 and re.search(r'[\u4e00-\u9fa5]', s) is not None
def generate_processed_data(ori_data_list, major, output_path, start_id=1):
    """Convert raw question records to the unified schema and save them.

    Records whose ``difficulty`` is not a key of the difficulty map, or that
    lack a required field, are skipped. The surviving records are numbered
    consecutively, passed through ``clean_data`` and written to
    ``output_path`` as JSON Lines. The input records are not modified.

    Args:
        ori_data_list: Iterable of raw records; each is expected to provide
            the keys 'difficulty', 'subject', 'question', 'answer',
            'analyzing', 'knowledge_point' and 'type'.
        major: Value stored in the "major" field of every output record.
        output_path: Destination path of the .jsonl file.
        start_id: Value of "id" for the first written record (default 1).

    Returns:
        None. The result is written to ``output_path``.
    """
    # NOTE(review): every value in this map is an empty string and the ''
    # key appears three times (once per apparent hard / medium / easy
    # group), so "hard_level" is always ''. The real labels look lost or
    # redacted -- confirm the intended values before relying on the field.
    difficulty_map = {
        '困难': '',
        '': '',
        'high': '',
        '5': '',
        '5星': '',
        'hard': '',
        '': '',
        'medium': '',
        '4': '',
        '3': '',
        '4星': '',
        '3星': '',
        '': '',
        'low': '',
        '1': '',
        '1星': '',
        '2': '',
        '2星': '',
        'easy': '',
    }
    processed_data_list = []
    skipped_count = 0
    next_id = start_id
    for ori_item in ori_data_list:
        try:
            difficulty = ori_item['difficulty']
            if difficulty not in difficulty_map:
                # Unknown difficulty labels are dropped, not guessed.
                skipped_count += 1
                continue
            # The mapped level is kept in the output record only; writing it
            # back into ori_item would corrupt the caller's data and break a
            # second pass over the same list.
            processed_item = {
                "grade_class": "高等教育",
                "grade": "大学",
                "major": major,
                # "major" comes from the argument, "major_2" from the record.
                "major_2": ori_item['subject'],
                "language": "zh",
                "id": next_id,
                "q_main": ori_item['question'],
                "std_ans": ori_item['answer'],
                "answer_detail": ori_item['analyzing'],
                "hard_level": difficulty_map[difficulty],
                "keypoint": ori_item['knowledge_point'],
                "q_type": ori_item['type']
            }
            # Sanitise the string values before they are serialised.
            processed_item = clean_data(processed_item)
        except (LookupError, TypeError) as e:
            # Missing field or a record of the wrong shape: skip it, but say
            # so instead of hiding the problem.
            skipped_count += 1
            print(f"Skipping malformed item ({type(e).__name__}: {e})")
            continue
        processed_data_list.append(processed_item)
        next_id += 1
    # Save the list as a .jsonl file; this is the earliest filtered and
    # format-unified result of the pipeline.
    print(f"Total valid JSON objects: {len(processed_data_list)}")
    print(f"Total skipped items: {skipped_count}")
    print("正在写入处理后的文件,请稍等...")
    with jsonlines.open(output_path, mode='w') as writer:
        writer.write_all(processed_data_list)
    print("写入完成!")
def clean_text(text):
    """Replace characters that cannot be encoded as UTF-8.

    Args:
        text: Any value; only ``str`` instances are processed.

    Returns:
        For a string, the same text with every unencodable character (for
        example a lone surrogate) replaced by '?'. Any other value is
        returned unchanged.
    """
    if not isinstance(text, str):
        return text
    # Round-trip through UTF-8: errors='replace' substitutes '?' for code
    # points the codec refuses, such as isolated surrogates.
    return text.encode('utf-8', errors='replace').decode('utf-8')
def clean_data(data):
    """Recursively apply ``clean_text`` to every string inside ``data``.

    Args:
        data: A dict, list, string or any other value.

    Returns:
        A new dict or list with the same shape whose string values have been
        cleaned (dict keys are kept as they are), a cleaned string, or the
        original value for any other type.
    """
    if isinstance(data, dict):
        return {key: clean_data(value) for key, value in data.items()}
    if isinstance(data, list):
        return [clean_data(item) for item in data]
    if isinstance(data, str):
        return clean_text(data)
    return data