# QUE_REPLACE/components/step1.py


# Start
import json
import jsonlines


def load_ori_file(ori_path, question_type_list):
    """Load raw question records from a JSON Lines file.

    Every non-blank line is parsed as one JSON value. Values accepted by
    ``ori_data_validate`` are collected; lines that cannot be parsed are
    reported on stdout and skipped.

    Args:
        ori_path: Path of the UTF-8 encoded JSON Lines file to read.
        question_type_list: Currently unused. The question-type filter and
            the "easy answer" filter that would sit in this loop are
            disabled; the parameter is kept so existing callers keep working.

    Returns:
        The parsed records that passed validation, in file order.
    """
    ori_data_list = []
    line_count = 0
    with open(ori_path, 'r', encoding='utf-8') as f:
        # enumerate() keeps the counter equal to the physical line number, so
        # a decode failure can no longer bump it twice and skew both the
        # reported line numbers and the final total.
        for line_count, line in enumerate(f, start=1):
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            try:
                data = json.loads(stripped)
            except json.JSONDecodeError:
                print(f"Error decoding JSON on line {line_count}: {stripped}")
                continue
            # Preliminary validation of the parsed record.
            if ori_data_validate(data):
                ori_data_list.append(data)
    print(f"Total lines processed: {line_count}")
    return ori_data_list


# Validity check for a raw question record.
def ori_data_validate(ori_data_str):
    """Return False when the record refers to a figure ("如图"), else True.

    The record may be a plain string or a parsed JSON value. For dicts,
    lists and tuples the marker is searched for in every nested string
    (dict keys included), because a plain ``in`` test on a dict only looks
    at its keys and would never see the question text.

    Args:
        ori_data_str: The raw record, either a string or a parsed JSON value.

    Returns:
        True if the record is usable, False if it mentions a figure or is of
        a type that cannot be searched (the error is printed in that case).
    """
    marker = "如图"

    def mentions_figure(value):
        # Recursively look for the marker inside nested JSON-like data.
        if isinstance(value, str):
            return marker in value
        if isinstance(value, dict):
            return any(
                mentions_figure(key) or mentions_figure(item)
                for key, item in value.items()
            )
        if isinstance(value, (list, tuple)):
            return any(mentions_figure(item) for item in value)
        return False

    try:
        if isinstance(ori_data_str, (dict, list, tuple)):
            return not mentions_figure(ori_data_str)
        # Strings, and any other type supporting ``in``, keep the plain test.
        return marker not in ori_data_str
    except Exception as e:
        # Deliberately broad: validation is best-effort and must never raise;
        # an unsearchable record (e.g. None or a number) is treated as invalid.
        print(e)
        return False
    
# Decide whether an answer counts as a "simple" answer.
def is_easy_answer(s):
    """Return True when ``s`` is at most 5 characters long and contains at
    least one character in the range U+4E00..U+9FA5; otherwise False."""
    import re

    is_short = len(s) <= 5
    # Short-circuits: the regex only runs for answers that are short enough.
    return is_short and re.search(r'[\u4e00-\u9fa5]', s) is not None


def generate_processed_data(ori_data_list, major, output_path):
    """Convert raw question records to the unified schema and write them out.

    Each record's ``difficulty`` is normalised to one of '难' / '中' / '低'.
    Records whose difficulty is not recognised, or that lack a required
    field, are skipped and counted. The surviving records get sequential
    ids starting at 1, are passed through ``clean_data`` and are written to
    ``output_path`` as JSON Lines. The input records are not modified.

    Args:
        ori_data_list: Iterable of raw records. Each is expected to be a
            mapping with the keys 'difficulty', 'subject', 'question',
            'answer', 'analyzing', 'knowledge_point' and 'type'.
        major: Value stored in the ``major`` field of every output record.
        output_path: Destination path of the JSON Lines file.

    Returns:
        None. The result is written to ``output_path``.
    """
    difficulty_map = {
        '困难': '难',
        '高': '难',
        'high': '难',
        '5': '难',
        '5星': '难',
        'hard': '难',
        # The canonical label is accepted as input too (like '中' and '低'),
        # so data that has already been normalised is not dropped.
        '难': '难',
        '中': '中',
        'medium': '中',
        '4': '中',
        '3': '中',
        '4星': '中',
        '3星': '中',
        '低': '低',
        'low': '低',
        '1': '低',
        '1星': '低',
        '2': '低',
        '2星': '低',
        'easy': '低',
    }

    processed_data_list = []
    unknown_difficulty = 0
    malformed = 0
    for ori_item in ori_data_list:
        try:
            # Normalise the lookup key so integer levels (5), mixed case
            # ("High") and padded strings (" 中 ") are recognised as well.
            difficulty_key = str(ori_item['difficulty']).strip().lower()
            hard_level = difficulty_map.get(difficulty_key)
            if hard_level is None:
                unknown_difficulty += 1
                continue
            # major_2 is taken from the record's own subject field.
            processed_item = {
                "grade_class": "高等教育",
                "grade": "大学",
                "major": major,
                "major_2": ori_item['subject'],
                "language": "zh",
                "id": len(processed_data_list) + 1,
                "q_main": ori_item['question'],
                "std_ans": ori_item['answer'],
                "answer_detail": ori_item['analyzing'],
                "hard_level": hard_level,
                "keypoint": ori_item['knowledge_point'],
                "q_type": ori_item['type']
            }
        except (KeyError, TypeError):
            # Missing field, or the record is not a mapping: skip it, but
            # keep count so the loss is visible in the summary below.
            malformed += 1
            continue
        # Replace characters that cannot be encoded as UTF-8.
        processed_data_list.append(clean_data(processed_item))

    if unknown_difficulty or malformed:
        print(
            f"Skipped records: {unknown_difficulty} with unrecognised difficulty, "
            f"{malformed} malformed or missing fields"
        )
    # Persist the list as a .jsonl file; this is the earliest filtered and
    # format-unified result of the pipeline.
    print(f"Total valid JSON objects: {len(processed_data_list)}")
    print("正在写入处理后的文件，请稍等...")
    with jsonlines.open(output_path, mode='w') as writer:
        writer.write_all(processed_data_list)
    print("写入完成！")
    

def clean_text(text):
    """Replace characters of a string that cannot be encoded as UTF-8.

    Non-string values are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    # Round-trip through UTF-8: characters that cannot be encoded (such as
    # lone surrogates) are substituted by the 'replace' error handler.
    encoded = text.encode('utf-8', errors='replace')
    return encoded.decode('utf-8')

def clean_data(data):
    """Recursively apply ``clean_text`` to every string inside ``data``.

    Dict values and list items are processed recursively (dict keys are left
    as they are); a new dict or list is returned. Any other value is returned
    unchanged.
    """
    if isinstance(data, str):
        return clean_text(data)
    if isinstance(data, dict):
        cleaned = {}
        for key, value in data.items():
            cleaned[key] = clean_data(value)
        return cleaned
    if isinstance(data, list):
        return list(map(clean_data, data))
    return data
init 1 month ago
			`# Start`
			`import json`
			`import jsonlines`


			`def load_ori_file(ori_path,question_type_list):`
			`ori_data_list = []`
			`line_count = 0`
			`with open(ori_path, 'r', encoding='utf-8') as f:`
			`for line in f:`
			`# 将每行解析为 JSON 对象`
			`try:`
			`line_count += 1`
			`data = json.loads(line.strip())`
			`# 判断题型`
			`# if data["type"] not in question_type_list:`
			`# continue`
			`# 简单回答题型过滤`
			`# if is_easy_answer(str(data["answer"])):`
			`# continue`
			`# # 初步校验`
			`if ori_data_validate(data):`
			`ori_data_list.append(data)`
			`except json.JSONDecodeError:`
			`print(f"Error decoding JSON on line {line_count}: {line.strip()}")`
			`line_count += 1`
			`continue`
			`print(f"Total lines processed: {line_count}")`
			`return ori_data_list`


			`# 用于原始题目数据有效性的验证`
			`def ori_data_validate(ori_data_str):`
			`try:`
			`if "如图" in ori_data_str:`
			`return False`
			`else:`
			`return True`
			`except Exception as e:`
			`print(e)`
			`return False`

			`# 判断是否为简单回答`
			`def is_easy_answer(s):`
			`# 判断长度不超过5，并且包含中文字符`
			`if len(s) > 5:`
			`return False`
			`# 使用正则表达式判断是否包含中文`
			`import re`
			`if re.search(r'[\u4e00-\u9fa5]', s):`
			`return True`
			`return False`


			`def generate_processed_data(ori_data_list,major,output_path):`
			`start_id = 1`
			`processed_data_list = []`
			`difficulty_map = {`
			`'困难': '难',`
			`'高': '难',`
			`'high': '难',`
			`'5': '难',`
			`'5星': '难',`
			`'hard': '难',`
			`'中': '中',`
			`'medium': '中',`
			`'4': '中',`
			`'3': '中',`
			`'4星': '中',`
			`'3星': '中',`
			`'低': '低',`
			`'low': '低',`
			`'1': '低',`
			`'1星': '低',`
			`'2': '低',`
			`'2星': '低',`
			`'easy': '低',`
			`}`

			`#major和major_2的映射关系`
			`for ori_item in ori_data_list:`
			`try:`
			`# 处理难度`
			`if ori_item['difficulty'] in difficulty_map:`
			`ori_item['difficulty'] = difficulty_map[ori_item['difficulty']]`
			`else:`
			`# 如果不匹配任何规则，可以选择跳过或记录日志`
			`continue`
			`# 处理数据`
			`processed_item = {`
			`"grade_class": "高等教育",`
			`"grade": "大学",`
			`"major": major,`
			`"major_2": ori_item['subject'],`
			`"language": "zh",`
			`"id": start_id,`
			`"q_main": ori_item['question'],`
			`"std_ans": ori_item['answer'],`
			`"answer_detail": ori_item['analyzing'],`
			`"hard_level": ori_item['difficulty'],`
			`"keypoint": ori_item['knowledge_point'],`
			`"q_type": ori_item['type']`
			`}`
			`#清理html标签、不可见字符、异常字符`
			`processed_item = clean_data(processed_item)`
			`processed_data_list.append(processed_item)`
			`start_id += 1`
			`except Exception as e:`
			`# logger.warning(f"KeyError: {e} in item: {ori_item}")`
			`continue`
			`# 将列表保存为 .jsonl 文件，这一步是最早的数据过滤和格式整合结果`
			`print(f"Total valid JSON objects: {len(processed_data_list)}")`
			`print("正在写入处理后的文件，请稍等...")`
			`with jsonlines.open(output_path, mode='w') as writer:`
			`writer.write_all(processed_data_list)`
			`print("写入完成！")`



			`def clean_text(text):`
			`"""清理字符串中的非法字符"""`
			`if isinstance(text, str):`
			`# 替换孤立的代理字符`
			`return text.encode('utf-8', errors='replace').decode('utf-8')`
			`return text`

			`def clean_data(data):`
			`if isinstance(data, dict):`
			`return {key: clean_data(value) for key, value in data.items()}`
			`elif isinstance(data, list):`
			`return [clean_data(item) for item in data]`
			`elif isinstance(data, str):`
			`return clean_text(data)`
			`return data`