# Start
"""Load raw question records from a JSONL file and convert them to a unified schema."""
import json
import re

try:
    import jsonlines
except ImportError:  # optional dependency; a stdlib writer is used instead
    jsonlines = None

# Substring marking a question that refers to a figure not present in the text.
_FIGURE_MARKER = "如图"

# Matches a single CJK ideograph in the U+4E00..U+9FA5 range.
_CJK_PATTERN = re.compile(r'[\u4e00-\u9fa5]')

# Raw difficulty label (lower-cased) -> normalized level written to "hard_level".
_DIFFICULTY_MAP = {
    '困难': '难',
    '高': '难',
    'high': '难',
    '5': '难',
    '5星': '难',
    'hard': '难',
    '中': '中',
    'medium': '中',
    '4': '中',
    '3': '中',
    '4星': '中',
    '3星': '中',
    '低': '低',
    'low': '低',
    '1': '低',
    '1星': '低',
    '2': '低',
    '2星': '低',
    'easy': '低',
}


def load_ori_file(ori_path, question_type_list):
    """Read a JSONL file and return the records that pass ``ori_data_validate``.

    Args:
        ori_path: Path of a UTF-8 text file holding one JSON document per line.
        question_type_list: Currently unused; the question-type filter it was
            meant for is disabled. Kept so existing callers keep working.

    Returns:
        A list with the parsed value of every line that decoded successfully
        and was accepted by ``ori_data_validate``.

    Lines that are not valid JSON are reported on stdout and skipped.
    """
    ori_data_list = []
    line_count = 0
    with open(ori_path, 'r', encoding='utf-8') as f:
        # enumerate() counts each physical line exactly once, so the reported
        # line numbers and the final total stay right after a decode error.
        for line_count, line in enumerate(f, start=1):
            text = line.strip()
            try:
                data = json.loads(text)
            except json.JSONDecodeError:
                print(f"Error decoding JSON on line {line_count}: {text}")
                continue
            if ori_data_validate(data):
                ori_data_list.append(data)
    print(f"Total lines processed: {line_count}")
    return ori_data_list


def ori_data_validate(ori_data_str):
    """Return False when a raw record mentions a figure ("如图"), else True.

    Args:
        ori_data_str: Either the record as a string, or the parsed record
            (dict or list). Parsed records are serialized first so that the
            marker is searched in the values, not just among dict keys.

    Returns:
        True if the record is usable, False if it contains the marker, cannot
        be serialized, or has an unsupported type.
    """
    if isinstance(ori_data_str, str):
        text = ori_data_str
    elif isinstance(ori_data_str, (dict, list)):
        try:
            # ensure_ascii=False keeps Chinese characters unescaped so the
            # substring test below can see them.
            text = json.dumps(ori_data_str, ensure_ascii=False)
        except (TypeError, ValueError) as e:
            print(e)
            return False
    else:
        print(f"Unsupported record type: {type(ori_data_str).__name__}")
        return False
    return _FIGURE_MARKER not in text


def is_easy_answer(s):
    """Return True if ``s`` is at most 5 characters long and contains a CJK character."""
    return len(s) <= 5 and _CJK_PATTERN.search(s) is not None


def _normalize_difficulty(value):
    """Map a raw difficulty label to '难'/'中'/'低'; return None if unrecognized."""
    # bool is an int subclass; "True"/"False" are never difficulty labels.
    if isinstance(value, bool) or not isinstance(value, (str, int)):
        return None
    return _DIFFICULTY_MAP.get(str(value).strip().lower())


def _write_jsonl(records, output_path):
    """Write ``records`` to ``output_path`` as UTF-8 JSON Lines."""
    if jsonlines is not None:
        with jsonlines.open(output_path, mode='w') as writer:
            writer.write_all(records)
        return
    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in records
        )


def generate_processed_data(ori_data_list, major, output_path):
    """Convert raw records to the unified schema and write them as JSONL.

    Args:
        ori_data_list: Iterable of raw records (dicts) providing the keys
            'difficulty', 'subject', 'question', 'answer', 'analyzing',
            'knowledge_point' and 'type'.
        major: Value stored in the "major" field of every output record.
        output_path: Destination path of the JSONL file.

    Records with an unrecognized difficulty, a missing key, or a non-dict
    shape are skipped; the number skipped is printed. Difficulty labels are
    matched case-insensitively after stripping whitespace, and integer labels
    are accepted. Output ids are consecutive, starting at 1. The input records
    are not modified.
    """
    processed_data_list = []
    skipped = 0
    for ori_item in ori_data_list:
        try:
            hard_level = _normalize_difficulty(ori_item['difficulty'])
            if hard_level is None:
                skipped += 1
                continue
            processed_item = {
                "grade_class": "高等教育",
                "grade": "大学",
                "major": major,
                "major_2": ori_item['subject'],
                "language": "zh",
                "id": len(processed_data_list) + 1,
                "q_main": ori_item['question'],
                "std_ans": ori_item['answer'],
                "answer_detail": ori_item['analyzing'],
                "hard_level": hard_level,
                "keypoint": ori_item['knowledge_point'],
                "q_type": ori_item['type'],
            }
            # Replace characters that cannot be encoded as UTF-8.
            processed_item = clean_data(processed_item)
        except (KeyError, TypeError):
            # Missing field or a record that is not a mapping.
            skipped += 1
            continue
        processed_data_list.append(processed_item)

    if skipped:
        print(f"Skipped records: {skipped}")
    # First-stage result: filtered records in the unified format.
    print(f"Total valid JSON objects: {len(processed_data_list)}")
    print("正在写入处理后的文件,请稍等...")
    _write_jsonl(processed_data_list, output_path)
    print("写入完成!")


def clean_text(text):
    """Replace characters that cannot be UTF-8 encoded (lone surrogates) with '?'.

    Non-string values are returned unchanged.
    """
    if isinstance(text, str):
        return text.encode('utf-8', errors='replace').decode('utf-8')
    return text


def clean_data(data):
    """Apply ``clean_text`` to every string value nested in dicts and lists.

    Dict keys are left as they are; values of other types are returned unchanged.
    """
    if isinstance(data, dict):
        return {key: clean_data(value) for key, value in data.items()}
    elif isinstance(data, list):
        return [clean_data(item) for item in data]
    elif isinstance(data, str):
        return clean_text(data)
    return data