QUE_REPLACE/components/step1.py



								# Start

								import json

								import jsonlines


								def load_ori_file(ori_path, question_type_list):
								    """Load raw question records from a JSON Lines file.

								    Every line of ``ori_path`` is parsed as one JSON value.  Lines that
								    cannot be parsed are reported on stdout and skipped; parsed records are
								    kept only when ``ori_data_validate`` accepts them.

								    Args:
								        ori_path: Path of the UTF-8 encoded JSON Lines file to read.
								        question_type_list: Question types to keep.  Currently unused: the
								            question-type filter and the easy-answer filter are disabled.
								            The parameter is retained so existing callers keep working.

								    Returns:
								        The list of parsed records that passed validation, in file order.

								    Raises:
								        OSError: If ``ori_path`` cannot be opened.
								        UnicodeDecodeError: If the file content is not valid UTF-8.
								    """
								    ori_data_list = []
								    line_count = 0  # stays 0 when the file is empty
								    with open(ori_path, 'r', encoding='utf-8') as f:
								        # enumerate() counts each line exactly once, whether or not it
								        # parses, so the reported total matches the real number of lines.
								        for line_count, line in enumerate(f, start=1):
								            stripped = line.strip()
								            try:
								                data = json.loads(stripped)
								            except json.JSONDecodeError:
								                print(f"Error decoding JSON on line {line_count}: {stripped}")
								                continue
								            # Preliminary validation of the parsed record.
								            if ori_data_validate(data):
								                ori_data_list.append(data)
								    print(f"Total lines processed: {line_count}")
								    return ori_data_list


								# Validate a raw question record before further processing

								def ori_data_validate(ori_data_str):
								    """Return whether a raw question record is usable.

								    A record is rejected when its text contains "如图" ("as shown in the
								    figure").

								    Args:
								        ori_data_str: The record to check, either as a raw string or as an
								            already parsed JSON container (dict or list).  Containers are
								            serialised first so that the check covers every nested key and
								            value; a plain ``in`` test on a dict would only look at its
								            top-level keys.

								    Returns:
								        True when the record does not contain "如图"; False when it does,
								        when the record is of an unsupported type, or when it cannot be
								        serialised.
								    """
								    if isinstance(ori_data_str, str):
								        text = ori_data_str
								    elif isinstance(ori_data_str, (dict, list)):
								        try:
								            # ensure_ascii=False keeps Chinese characters literal so the
								            # substring test below can match them.
								            text = json.dumps(ori_data_str, ensure_ascii=False)
								        except (TypeError, ValueError) as e:
								            print(e)
								            return False
								    else:
								        # Scalars and None are not question records.
								        print(f"Unsupported record type: {type(ori_data_str).__name__}")
								        return False
								    return "如图" not in text


								# Check whether an answer counts as a "simple" answer

								def is_easy_answer(s):
								    """Return True when ``s`` is a short answer that contains Chinese text.

								    An answer counts as "easy" when it is at most 5 characters long and
								    contains at least one character in the range U+4E00..U+9FA5.
								    """
								    import re

								    short_enough = len(s) <= 5
								    return short_enough and re.search(r'[\u4e00-\u9fa5]', s) is not None


								def generate_processed_data(ori_data_list, major, output_path):
								    """Convert raw question records to the unified schema and save them.

								    Each record's difficulty label is mapped onto one of '难' / '中' / '低'.
								    Records whose difficulty is not recognised, or that lack a required
								    field, are skipped and counted.  The remaining records are rebuilt with
								    the output field names, passed through ``clean_data``, numbered
								    sequentially from 1 and written to ``output_path`` as JSON Lines.

								    The input records are not modified, so the same list can be processed
								    more than once with the same result.

								    Args:
								        ori_data_list: Iterable of raw records.  Each is expected to be a
								            dict with the keys 'difficulty', 'subject', 'question',
								            'answer', 'analyzing', 'knowledge_point' and 'type'.
								        major: Value stored in the "major" field of every output record.
								        output_path: Path of the JSON Lines file to write.

								    Returns:
								        None.  The result is written to ``output_path``.
								    """
								    difficulty_map = {
								        '困难': '难',
								        '高': '难',
								        'high': '难',
								        '5': '难',
								        '5星': '难',
								        'hard': '难',
								        # Already-normalised label, accepted like '中' and '低' below.
								        '难': '难',
								        '中': '中',
								        'medium': '中',
								        '4': '中',
								        '3': '中',
								        '4星': '中',
								        '3星': '中',
								        '低': '低',
								        'low': '低',
								        '1': '低',
								        '1星': '低',
								        '2': '低',
								        '2星': '低',
								        'easy': '低',
								    }

								    processed_data_list = []
								    unknown_difficulty = 0
								    malformed = 0
								    next_id = 1
								    for ori_item in ori_data_list:
								        try:
								            # Normalise before the lookup so that 'Hard', ' 5 ' or the
								            # integer 5 match the same entries as 'hard' and '5'.
								            difficulty_key = str(ori_item['difficulty']).strip().lower()
								            hard_level = difficulty_map.get(difficulty_key)
								            if hard_level is None:
								                unknown_difficulty += 1
								                continue
								            # "major" comes from the caller; "major_2" is the record's
								            # own subject.
								            processed_item = {
								                "grade_class": "高等教育",
								                "grade": "大学",
								                "major": major,
								                "major_2": ori_item['subject'],
								                "language": "zh",
								                "id": next_id,
								                "q_main": ori_item['question'],
								                "std_ans": ori_item['answer'],
								                "answer_detail": ori_item['analyzing'],
								                "hard_level": hard_level,
								                "keypoint": ori_item['knowledge_point'],
								                "q_type": ori_item['type'],
								            }
								        except (KeyError, TypeError):
								            # A required key is missing, or the record is not a dict.
								            malformed += 1
								            continue
								        # Sanitise every string value before the record is stored.
								        processed_data_list.append(clean_data(processed_item))
								        next_id += 1

								    # This file is the earliest filtered and schema-unified result.
								    print(f"Total valid JSON objects: {len(processed_data_list)}")
								    print(
								        f"Skipped items: {unknown_difficulty} with unrecognized difficulty, "
								        f"{malformed} with missing or invalid fields"
								    )
								    print("正在写入处理后的文件，请稍等...")
								    with jsonlines.open(output_path, mode='w') as writer:
								        writer.write_all(processed_data_list)
								    print("写入完成！")


								def clean_text(text):
								    """Replace characters that cannot be encoded as UTF-8 in a string.

								    Non-string values are returned unchanged.
								    """
								    if not isinstance(text, str):
								        return text
								    # Round-trip through UTF-8: code points that cannot be encoded, such
								    # as lone surrogates, come back as '?'.
								    utf8_bytes = text.encode('utf-8', errors='replace')
								    return utf8_bytes.decode('utf-8')


								def clean_data(data):
								    """Return a copy of ``data`` with every nested string cleaned.

								    Dict values and list items are processed recursively and strings are
								    passed through ``clean_text``.  Dict keys and values of any other type
								    are returned as they are.
								    """
								    if isinstance(data, str):
								        return clean_text(data)
								    if isinstance(data, list):
								        return list(map(clean_data, data))
								    if isinstance(data, dict):
								        cleaned = {}
								        for field, value in data.items():
								            cleaned[field] = clean_data(value)
								        return cleaned
								    return data