QUE_REPLACE/components/ai_check.py

import glob
import json
from pathlib import Path
import time

from components.doubao_process import ask_question
# from components.jsonl_repair_reindex import process_json

def get_spilit_file_list(file_path):
    """Return the paths of the ``.jsonl`` files directly inside ``file_path``.

    Args:
        file_path: Directory path as a string; ``'/*.jsonl'`` is appended
            to it to form the glob pattern.

    Returns:
        A list of matching paths in the order ``glob.glob`` reports them.
        The list is empty when nothing matches.
    """
    # NOTE: the pattern has no ``**`` component, so ``recursive=True`` does
    # not make this descend into sub-directories; only one level is matched.
    pattern = file_path + '/*.jsonl'
    matches = glob.glob(pattern, recursive=True)
    return matches


def process_jsonl_file(file_path, output_folder, logger):
    """Ask the model to rewrite ``answer_detail`` for every record in a JSONL file.

    All records are parsed up front. For each one, a prompt is built from a
    fixed instruction plus the compact JSON produced by ``process_question``,
    sent through ``ask_question``, and the reply is appended as one line to
    ``file_path + '.replace'``.

    Args:
        file_path: Path (string) of the input JSONL file.
        output_folder: Accepted but not used by this function.
        logger: Object exposing ``info``; receives each prompt payload, each
            reply, and the total elapsed time.
    """
    instruction = '该json中的answer_detail不够详细,请改写为详细内容,格式要求为分步骤推导，格式为："Step 1: ...\\nStep 2: ...\\nStep 3: ...;要求公式呈现 所有使用的公式需独立写出（如"F=ma"），不能隐含在文字中变量代入过程,展示具体数值代入公式的过程（如"9.8=5×a"）;latex公式使用双斜杠;返回具有id,answer_detail字段的新json,json不要换行,不需要其他回复'
    # Parse the whole input before issuing any request, so a malformed line
    # aborts the run before the first model call.
    records = list(read_jsonl(file_path))

    started_at = time.time()
    for record in records:
        quest_detail = process_question(record)
        ai_response = ask_question(instruction + quest_detail)

        logger.info(f"问题: {quest_detail}")
        logger.info(f"回答: {ai_response}")
        logger.info("===================================")

        # The output file is reopened in append mode for every record, so
        # each reply is written out before the next request starts. The reply
        # is stored verbatim; it is not parsed or validated here.
        with open(file_path + '.replace', 'a', encoding='utf-8') as sink:
            sink.write(ai_response + '\n')

    finished_at = time.time()
    logger.info(f"处理时间: {finished_at - started_at}秒")
def read_jsonl(file_path):
    """Lazily yield one parsed JSON value per non-blank line of a JSONL file.

    Args:
        file_path: Path of a UTF-8 encoded file holding one JSON document
            per line.

    Yields:
        The result of ``json.loads`` for each non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            text = line.strip()
            if not text:
                # Blank or whitespace-only lines (commonly a trailing empty
                # line) carry no record; json.loads('') would raise on them.
                continue
            yield json.loads(text)


# Normalization helper: reduces a record to the fields sent to the model
def process_question(json_line):
    """Serialize the prompt-relevant fields of one record as compact JSON.

    Args:
        json_line: Mapping for a single record. ``q_main`` and
            ``answer_detail`` default to ``''`` when absent; ``id`` defaults
            to ``None``.

    Returns:
        A JSON string with the keys ``id``, ``question`` and
        ``answer_detail`` (in that order), with non-ASCII characters left
        unescaped.
    """
    record = json_line
    # Field values are passed through verbatim; no whitespace or number
    # compression is applied to the question text.
    payload = {
        'id': record.get('id'),
        'question': record.get('q_main', ''),
        'answer_detail': record.get('answer_detail', ''),
    }
    return json.dumps(payload, ensure_ascii=False)
    
# Write records to a JSONL file
def write_jsonl(file_path, data_list):
    """Overwrite ``file_path`` with one JSON document per line.

    Args:
        file_path: Destination path; opened in ``'w'`` mode as UTF-8, so any
            existing content is replaced.
        data_list: Iterable of JSON-serializable items, written in order.
    """
    with open(file_path, 'w', encoding='utf-8') as sink:
        # Each item becomes exactly one newline-terminated line; non-ASCII
        # characters are written as-is rather than as \u escapes.
        sink.writelines(
            json.dumps(entry, ensure_ascii=False) + '\n' for entry in data_list
        )
init 1 month ago			`import glob`
			`import json`
			`from pathlib import Path`
			`import time`

			`from components.doubao_process import ask_question`
			`# from components.jsonl_repair_reindex import process_json`

			`def get_spilit_file_list(file_path):`
			`# 递归匹配所有子目录中的 .txt 文件`
			`recursive_txt_files = glob.glob(file_path+'/*.jsonl', recursive=True)`
			`return recursive_txt_files`


			`def process_jsonl_file(file_path,output_folder,logger):`
			`base_prompt = '该json中的answer_detail不够详细,请改写为详细内容,格式要求为分步骤推导，格式为："Step 1: ...\\nStep 2: ...\\nStep 3: ...;要求公式呈现所有使用的公式需独立写出（如"F=ma"），不能隐含在文字中变量代入过程,展示具体数值代入公式的过程（如"9.8=5×a"）;latex公式使用双斜杠;返回具有id,answer_detail字段的新json,json不要换行,不需要其他回复'`
			`temp_cache_0 = list(read_jsonl(file_path))`

			`start_time = time.time()`
			`for item in temp_cache_0:`
			`quest_detail = process_question(item)`
			`full_question = base_prompt+quest_detail`
			`ai_response = ask_question(full_question)`
			`logger.info(f"问题: {quest_detail}")`
			`logger.info(f"回答: {ai_response}")`
			`logger.info("===================================")`
			`with open(file_path + '.replace', 'a', encoding='utf-8') as f:`
			`# json_str = process_json(ai_response)`
			`# json_line = json.dumps(ai_response, ensure_ascii=False)`
			`f.write(ai_response + '\n')`

			`end_time = time.time()`
			`logger.info(f"处理时间: {end_time - start_time}秒")`
			`def read_jsonl(file_path):`
			`with open(file_path, 'r', encoding='utf-8') as f:`
			`for line in f:`
			`yield json.loads(line.strip()) # 每行解析为字典 [[3]]`


			`# 标准化处理函数 [[8]][[9]]`
			`def process_question(json_line):`
			`data = json_line`
			`# 提取关键字段（根据实际JSON结构调整）`
			`question = data.get('q_main', '')`
			`answer = data.get('answer_detail', '')`

			`# 文本压缩处理`
			`# question = re.sub(r'\s+', ' ', question).strip()`
			`# question = re.sub(r'(\d)\s+(\d)', r'\1\2', question) # 合并断裂数字`
			`# question = re.sub(r'(?<![\d.])\d{4,}(?![\d.])',`
			`# lambda m: f"{float(m.group()):.2e}", question) # 科学计数法`

			`# 重组为最小化JSON结构`
			`return json.dumps({`
			`'id': data.get('id'),`
			`'question': question,`
			`'answer_detail': answer`
			`}, ensure_ascii=False)`

			`# 写入 JSONL 文件`
			`def write_jsonl(file_path, data_list):`
			`with open(file_path, 'w', encoding='utf-8') as f:`
			`for item in data_list:`
			`json_line = json.dumps(item, ensure_ascii=False) # 转换为JSON字符串 [[3]]`
			`f.write(json_line + '\n') # 每行写入一个JSON对象 [[2]`