import glob
import json
from pathlib import Path
import time

from components.doubao_process import ask_question
# from components.jsonl_repair_reindex import process_json


def get_spilit_file_list(file_path, *, recursive=False):
    """Return the paths of the ``.jsonl`` files found under *file_path*.

    Args:
        file_path: Directory to search, as a string path.
        recursive: When false (the default) only files directly inside
            *file_path* are matched. When true, files in all
            sub-directories are matched as well.

    Returns:
        A list of matching path strings, as produced by ``glob.glob``.
    """
    # ``recursive=True`` only has an effect when the pattern contains ``**``,
    # so the recursive variant needs its own pattern.
    pattern = file_path + ('/**/*.jsonl' if recursive else '/*.jsonl')
    return glob.glob(pattern, recursive=recursive)


def process_jsonl_file(file_path, output_folder, logger):
    """Ask the model to rewrite ``answer_detail`` for every record of a file.

    Each record of the JSONL file at *file_path* is turned into a question
    with ``process_question``, prefixed with a fixed prompt and sent through
    ``ask_question``. Every response is appended as one line to
    ``file_path + '.replace'``.

    Args:
        file_path: Path of the input JSONL file.
        output_folder: Currently unused; the output is always written next
            to the input file.
        logger: Object exposing ``info(message)``; receives the question,
            the answer and the total processing time.
    """
    base_prompt = '该json中的answer_detail不够详细,请改写为详细内容,格式要求为分步骤推导,格式为:"Step 1: ...\\nStep 2: ...\\nStep 3: ...;要求公式呈现 所有使用的公式需独立写出(如"F=ma"),不能隐含在文字中变量代入过程,展示具体数值代入公式的过程(如"9.8=5×a");latex公式使用双斜杠;返回具有id,answer_detail字段的新json,json不要换行,不需要其他回复'
    # Read everything up front so a malformed input line fails before any
    # request is sent.
    records = list(read_jsonl(file_path))
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments.
    start_time = time.perf_counter()
    for item in records:
        quest_detail = process_question(item)
        full_question = base_prompt + quest_detail
        ai_response = ask_question(full_question)
        logger.info(f"问题: {quest_detail}")
        logger.info(f"回答: {ai_response}")
        logger.info("===================================")
        # Re-open in append mode for every record so each answer is on disk
        # before the next request starts.
        with open(file_path + '.replace', 'a', encoding='utf-8') as f:
            # The response is written verbatim; the prompt asks for a
            # single-line JSON object.
            # NOTE(review): assumes ask_question returns a str - confirm.
            f.write(ai_response + '\n')
    end_time = time.perf_counter()
    logger.info(f"处理时间: {end_time - start_time}秒")


def read_jsonl(file_path):
    """Yield one parsed JSON value per non-blank line of a JSONL file.

    Args:
        file_path: Path of the UTF-8 encoded JSONL file.

    Yields:
        The value decoded from each line by ``json.loads``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            stripped = line.strip()
            # Blank lines (for example a trailing empty line) carry no record
            # and would make json.loads raise.
            if not stripped:
                continue
            yield json.loads(stripped)


# Normalization helper
def process_question(json_line):
    data = json_line
    # Extract the key fields (adjust to the actual JSON structure)
    question = data.get('q_main', '')
    answer = data.get('answer_detail', '')
    # Text compaction
    # question = re.sub(r'\s+', ' ', question).strip()
    # question = re.sub(r'(\d)\s+(\d)', r'\1\2', question)  # merge split digits
    # question = re.sub(r'(?