You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
66 lines
2.8 KiB
66 lines
2.8 KiB
import glob
|
|
import json
|
|
from pathlib import Path
|
|
import time
|
|
|
|
from components.doubao_process import ask_question
|
|
# from components.jsonl_repair_reindex import process_json
|
|
|
|
def get_spilit_file_list(file_path):
    """Return the ``.jsonl`` files located directly inside a directory.

    Args:
        file_path: Directory to scan, given as a ``str`` or any object whose
            string form is the directory path (e.g. ``pathlib.Path``).

    Returns:
        A list of path strings matching ``<file_path>/*.jsonl``, in the
        order produced by ``glob.glob`` (not sorted).
    """
    # Formatting instead of ``+`` lets pathlib.Path arguments work too; for a
    # plain str the resulting pattern is identical to the concatenation.
    pattern = f"{file_path}/*.jsonl"
    # NOTE: ``recursive=True`` only has an effect when the pattern contains
    # ``**``; with this pattern sub-directories are NOT searched unless the
    # caller passes a ``file_path`` that itself contains ``**``.
    return glob.glob(pattern, recursive=True)
|
|
|
|
|
|
def process_jsonl_file(file_path, output_folder, logger):
    """Ask the model to rewrite every record of a JSONL file and save replies.

    Each record is reduced by ``process_question``, appended to a fixed
    rewrite prompt, and passed to ``ask_question``. The raw reply is appended
    as one line to ``<file_path>.replace``.

    Args:
        file_path: Path string of the input JSONL file; also the prefix of
            the output file name.
        output_folder: Accepted for interface compatibility; this function
            does not read it.
        logger: Object exposing ``info(message)``; receives the question,
            the reply, a separator, and the total elapsed time.

    NOTE(review): ``ask_question`` is imported from
    ``components.doubao_process`` and is opaque here. It must return a
    ``str`` because the reply is concatenated with a newline — confirm.
    """
    base_prompt = '该json中的answer_detail不够详细,请改写为详细内容,格式要求为分步骤推导,格式为:"Step 1: ...\\nStep 2: ...\\nStep 3: ...;要求公式呈现 所有使用的公式需独立写出(如"F=ma"),不能隐含在文字中变量代入过程,展示具体数值代入公式的过程(如"9.8=5×a");latex公式使用双斜杠;返回具有id,answer_detail字段的新json,json不要换行,不需要其他回复'

    # Parse the whole input before the timer starts, so a malformed line
    # fails before any request is issued.
    records = list(read_jsonl(file_path))

    started_at = time.time()
    for record in records:
        quest_detail = process_question(record)
        ai_response = ask_question(base_prompt + quest_detail)

        logger.info(f"问题: {quest_detail}")
        logger.info(f"回答: {ai_response}")
        logger.info("===================================")

        # Re-open in append mode per record so each reply reaches the file
        # as soon as it is received.
        with open(file_path + '.replace', 'a', encoding='utf-8') as sink:
            sink.write(ai_response + '\n')

    elapsed = time.time() - started_at
    logger.info(f"处理时间: {elapsed}秒")
|
|
def read_jsonl(file_path):
    """Lazily yield one decoded JSON value per non-blank line of a file.

    Args:
        file_path: Path of the UTF-8 encoded JSONL file to read.

    Yields:
        The value returned by ``json.loads`` for each non-blank line (a
        ``dict`` when the line holds a JSON object).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            payload = line.strip()
            # Blank lines (typically a trailing empty line at EOF) carry no
            # record; skipping them avoids a spurious JSONDecodeError.
            if not payload:
                continue
            yield json.loads(payload)
|
|
|
|
|
|
# Reduce one record to the minimal JSON payload.
def process_question(json_line):
    """Serialize the id, question, and answer of one record as a JSON string.

    Args:
        json_line: One record as a mapping supporting ``.get`` (despite the
            parameter name, not a raw text line). The keys ``id``,
            ``q_main`` and ``answer_detail`` are read; others are ignored.

    Returns:
        A JSON string with the keys ``id``, ``question`` and
        ``answer_detail``, in that order. A missing ``id`` becomes ``null``;
        a missing question or answer becomes an empty string. Non-ASCII
        characters are written unescaped.
    """
    record = json_line
    payload = {
        'id': record.get('id'),
        'question': record.get('q_main', ''),
        'answer_detail': record.get('answer_detail', ''),
    }
    return json.dumps(payload, ensure_ascii=False)
|
|
|
|
# Write a JSONL file.
def write_jsonl(file_path, data_list):
    """Write each item of an iterable as one JSON line, replacing the file.

    Args:
        file_path: Destination path; opened in ``'w'`` mode with UTF-8
            encoding, so existing content is overwritten.
        data_list: Iterable of JSON-serializable items.

    Each item is serialized with ``ensure_ascii=False`` (non-ASCII text is
    written unescaped) and terminated by a newline.
    """
    with open(file_path, 'w', encoding='utf-8') as out:
        out.writelines(
            json.dumps(entry, ensure_ascii=False) + '\n' for entry in data_list
        )
|
|
|