# Start
import json
import jsonlines


def load_ori_file(ori_path, question_type_list):
    """Load a JSON Lines file and return the records that pass validation.

    Every line of the file is parsed as one JSON document. Lines that are
    not valid JSON are reported on stdout and skipped; parsed records are
    kept only when ``ori_data_validate`` accepts them.

    Args:
        ori_path: Path of the UTF-8 encoded JSON Lines file to read.
        question_type_list: Currently unused. Kept so existing callers keep
            working; question-type filtering is disabled at the moment.

    Returns:
        A list with the parsed records that passed validation, in file order.
    """
    ori_data_list = []
    line_count = 0
    with open(ori_path, 'r', encoding='utf-8') as f:
        # enumerate() gives each line exactly one number, so the line number
        # in the error message and the final total always match the file
        # (malformed lines used to be counted twice).
        for line_count, line in enumerate(f, start=1):
            stripped = line.strip()
            # Keep the try body to the parse call only, so errors raised by
            # validation are not mistaken for decoding problems.
            try:
                data = json.loads(stripped)
            except json.JSONDecodeError:
                print(f"Error decoding JSON on line {line_count}: {stripped}")
                continue
            if ori_data_validate(data):
                ori_data_list.append(data)
    print(f"Total lines processed: {line_count}")
    return ori_data_list


def ori_data_validate(ori_data_str):
    """Validate a raw question record before further processing.

    A record is rejected when its text contains "如图" ("as shown in the
    figure"), i.e. when the question refers to a picture.

    Args:
        ori_data_str: The record to check, either as a string or as an
            already parsed JSON container (dict or list).

    Returns:
        True when the record is acceptable, False when it contains the
        marker or when the check itself fails.
    """
    try:
        if isinstance(ori_data_str, (dict, list)):
            # ``in`` on a dict only looks at its keys, so a parsed record
            # would never match. Serialise it so that every nested value is
            # searched; ensure_ascii=False keeps the Chinese text literal.
            text = json.dumps(ori_data_str, ensure_ascii=False, default=str)
        else:
            text = ori_data_str
        return "如图" not in text
    except Exception as e:
        # Inputs that cannot be searched (e.g. None) are treated as invalid.
        print(e)
        return False
    
def is_easy_answer(s):
    """Tell whether an answer counts as a "simple" one.

    An answer is simple when it is at most 5 characters long and contains
    at least one character in the range U+4E00..U+9FA5.
    """
    import re

    within_limit = len(s) <= 5
    # Short-circuit: the regex is only evaluated for short answers.
    return within_limit and re.search(r'[\u4e00-\u9fa5]', s) is not None


# Maps the raw difficulty labels seen in source records to the canonical
# levels written to the output: '难' (hard), '中' (medium), '低' (easy).
# NOTE(review): '中' and '低' are accepted as raw labels but '难' is not —
# confirm whether that omission is intentional.
_DIFFICULTY_MAP = {
    '困难': '难',
    '高': '难',
    'high': '难',
    '5': '难',
    '5星': '难',
    'hard': '难',
    '中': '中',
    'medium': '中',
    '4': '中',
    '3': '中',
    '4星': '中',
    '3星': '中',
    '低': '低',
    'low': '低',
    '1': '低',
    '1星': '低',
    '2': '低',
    '2星': '低',
    'easy': '低',
}


def generate_processed_data(ori_data_list, major, output_path):
    """Convert raw question records to the unified schema and save them.

    For each record the difficulty label is mapped to a canonical level,
    the fields are copied into the output schema, the result is passed
    through ``clean_data`` and finally all converted records are written to
    ``output_path`` as JSON Lines. Records with an unrecognised difficulty
    or with missing/invalid fields are skipped and counted.

    Args:
        ori_data_list: Iterable of raw records; each is expected to provide
            the keys 'difficulty', 'subject', 'question', 'answer',
            'analyzing', 'knowledge_point' and 'type'.
        major: Value stored in the "major" field of every output record.
        output_path: Destination path of the JSON Lines file.

    Returns:
        None. The converted records are written to ``output_path``.
    """
    processed_data_list = []
    skipped_difficulty = 0
    skipped_error = 0
    first_error = None

    for ori_item in ori_data_list:
        try:
            # Normalise the label so ints (5), different casing ('High') and
            # padded strings (' 中 ') match the table. The input record is
            # not modified: writing the canonical level back into it made a
            # second run over the same list drop every '难' record.
            label = str(ori_item['difficulty']).strip().lower()
            hard_level = _DIFFICULTY_MAP.get(label)
            if hard_level is None:
                skipped_difficulty += 1
                continue

            processed_item = {
                "grade_class": "高等教育",
                "grade": "大学",
                "major": major,
                "major_2": ori_item['subject'],
                "language": "zh",
                # Ids are consecutive over the records that are kept.
                "id": len(processed_data_list) + 1,
                "q_main": ori_item['question'],
                "std_ans": ori_item['answer'],
                "answer_detail": ori_item['analyzing'],
                "hard_level": hard_level,
                "keypoint": ori_item['knowledge_point'],
                "q_type": ori_item['type'],
            }
            processed_data_list.append(clean_data(processed_item))
        except Exception as e:
            # Best effort: a broken record must not abort the whole run,
            # but it is counted so the loss is visible in the summary.
            skipped_error += 1
            if first_error is None:
                first_error = repr(e)

    # This file is the earliest filtered and normalised form of the data.
    print(f"Total valid JSON objects: {len(processed_data_list)}")
    print(
        f"Skipped items: {skipped_difficulty} with unrecognized difficulty, "
        f"{skipped_error} with missing or invalid fields"
    )
    if first_error is not None:
        print(f"First error while converting items: {first_error}")
    print("正在写入处理后的文件，请稍等...")
    with jsonlines.open(output_path, mode='w') as writer:
        writer.write_all(processed_data_list)
    print("写入完成！")
    

def clean_text(text):
    """Return *text* with characters that cannot be encoded as UTF-8 replaced.

    Strings are round-tripped through UTF-8 with ``errors='replace'``, which
    substitutes unencodable characters such as lone surrogates. Values that
    are not strings are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    utf8_bytes = text.encode('utf-8', errors='replace')
    return utf8_bytes.decode('utf-8')

def clean_data(data):
    """Recursively apply ``clean_text`` to every string inside *data*.

    Dicts and lists are rebuilt with their values/items cleaned (dict keys
    are kept as they are); strings are cleaned directly; any other value is
    returned unchanged.
    """
    if isinstance(data, str):
        return clean_text(data)
    if isinstance(data, list):
        return list(map(clean_data, data))
    if isinstance(data, dict):
        cleaned = {}
        for field, value in data.items():
            cleaned[field] = clean_data(value)
        return cleaned
    return data