You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

66 lines
2.8 KiB

1 month ago
import glob
import json
from pathlib import Path
import time
from components.doubao_process import ask_question
# from components.jsonl_repair_reindex import process_json
def get_spilit_file_list(file_path):
    """Return the ``.jsonl`` files found directly inside ``file_path``.

    The pattern handed to ``glob.glob`` is ``<file_path>/*.jsonl``.
    ``recursive=True`` is passed as well, but since the pattern contains no
    ``**`` component only immediate children of the directory are matched.

    Args:
        file_path: Directory path as a string; the pattern suffix is
            concatenated onto it.

    Returns:
        A list of matching path strings, in whatever order ``glob.glob``
        produces them.
    """
    jsonl_pattern = file_path + '/*.jsonl'
    return glob.glob(jsonl_pattern, recursive=True)
def process_jsonl_file(file_path, output_folder, logger):
    """Request a rewritten ``answer_detail`` for each record of a JSONL file.

    Every record is reduced by ``process_question`` to a compact JSON string,
    appended to a fixed instruction prompt and passed to ``ask_question``.
    Each reply is appended as one line to ``file_path + '.replace'``.

    Args:
        file_path: Path string of the input JSONL file; it is also the prefix
            of the output file name.
        output_folder: Accepted but not used by this function.
        logger: Object exposing ``info(message)``; receives the question, the
            reply, a separator line and the total elapsed time.

    NOTE(review): the reply is concatenated with ``'\\n'``, so ``ask_question``
    presumably returns a ``str`` -- confirm against its implementation.
    NOTE(review): the append is assumed to happen once per record and the
    timing log once after the loop -- confirm against the original layout.
    """
    base_prompt = '该json中的answer_detail不够详细,请改写为详细内容,格式要求为分步骤推导,格式为:"Step 1: ...\\nStep 2: ...\\nStep 3: ...;要求公式呈现 所有使用的公式需独立写出(如"F=ma"),不能隐含在文字中变量代入过程,展示具体数值代入公式的过程(如"9.8=5×a");latex公式使用双斜杠;返回具有id,answer_detail字段的新json,json不要换行,不需要其他回复'
    replace_path = file_path + '.replace'

    # Parse the whole input up front, before the first request is issued.
    records = list(read_jsonl(file_path))

    started_at = time.time()
    for record in records:
        payload = process_question(record)
        reply = ask_question(base_prompt + payload)

        logger.info(f"问题: {payload}")
        logger.info(f"回答: {reply}")
        logger.info("===================================")

        # Re-open in append mode per record so each reply reaches the file
        # as soon as it is available.
        with open(replace_path, 'a', encoding='utf-8') as out:
            out.write(reply + '\n')

    finished_at = time.time()
    logger.info(f"处理时间: {finished_at - started_at}")
def read_jsonl(file_path):
    """Lazily yield one parsed JSON value per non-blank line of a JSONL file.

    Blank and whitespace-only lines (for example a trailing empty line) are
    skipped instead of being handed to ``json.loads``, which would reject
    them.  The file is decoded as ``utf-8-sig`` so that a leading UTF-8 byte
    order mark is dropped; files without one decode exactly as with UTF-8.

    Args:
        file_path: Path of the JSONL file to read.

    Yields:
        The value decoded from each non-blank line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the file cannot be opened.
    """
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        for line in f:
            text = line.strip()
            if not text:
                # Nothing to parse on this line.
                continue
            yield json.loads(text)
# Normalization helper: builds the per-record JSON payload that is appended to the prompt.
def process_question(json_line):
    """Serialize the fields of one record that are sent along with the prompt.

    Args:
        json_line: Mapping for a single record.  ``q_main`` and
            ``answer_detail`` fall back to ``''`` when missing; ``id`` falls
            back to ``None``.

    Returns:
        A JSON string with the keys ``id``, ``question`` and
        ``answer_detail`` (in that order), with non-ASCII text left unescaped.
    """
    # No text compression is applied; the values are passed through as-is.
    minimal_record = {
        'id': json_line.get('id'),
        'question': json_line.get('q_main', ''),
        'answer_detail': json_line.get('answer_detail', ''),
    }
    return json.dumps(minimal_record, ensure_ascii=False)
# Write records to a JSONL file.
def write_jsonl(file_path, data_list):
    """Write each item of ``data_list`` as one JSON line to ``file_path``.

    The file is opened in ``'w'`` mode, so any existing content is replaced.
    Non-ASCII characters are written unescaped, encoded as UTF-8.

    Args:
        file_path: Destination path.
        data_list: Iterable of JSON-serializable items.
    """
    with open(file_path, 'w', encoding='utf-8') as sink:
        # One serialized object per line, each terminated by a newline.
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + '\n'
            for record in data_list
        )