You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

69 lines
2.5 KiB

1 month ago
import logging
import os
from pathlib import Path
import sys
from components.ai_check import get_spilit_file_list, process_jsonl_file
from components.spilit import split_jsonl
from components.step1 import generate_processed_data, load_ori_file, ori_data_validate
from step4_major import start_process_major2
# --- Input arguments ---
# argv[1]: full path of the input JSONL file
# argv[2]: subject/major identifier used by downstream filters
# argv[3]: chunk size for file splitting (string; converted with int() at use site)
start_file_path = sys.argv[1]
major = sys.argv[2]
split_size = sys.argv[3]
# Derive file name, parent directory, and stem from the input path
p = Path(start_file_path)
ori_file_name = str(p.name)
ori_file_path = str(p.parent)
ori_file_pure_name = str(p.stem)
# Question types retained by the filtering step (fill-in-the-blank, free-response)
question_type_list = ['填空题', '解答题']
# --- Initialize working directories (one per pipeline stage) ---
os.makedirs(ori_file_path+"/transformed", exist_ok=True)
os.makedirs(ori_file_path+"/major2_processed", exist_ok=True)
os.makedirs(ori_file_path+"/spilited_ai1", exist_ok=True)
os.makedirs(ori_file_path+"/ai_1", exist_ok=True)
os.makedirs(ori_file_path+"/ai_2", exist_ok=True)
os.makedirs(ori_file_path+"/ai_2_total", exist_ok=True)
os.makedirs(ori_file_path+"/log", exist_ok=True)
# --- Configure the logging system ---
logging.basicConfig(
    level=logging.DEBUG,  # lowest level: capture everything
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    # FIX: original concatenated ori_file_path+"/log"+name, producing
    # ".../logNAME.log" *beside* the "log" directory created above instead
    # of inside it; the missing separator is restored here.
    filename=ori_file_path+"/log/"+ori_file_pure_name+'.log',
    filemode='a'  # 'a' = append, 'w' = overwrite
)
# Module-level logger shared by the pipeline steps below
logger = logging.getLogger(__name__)
def process_wrapper(args):
    """Adapter for pool-style mapping APIs that deliver one tuple per task.

    Unpacks ``(input_path, output_dir, log)`` and runs a single AI-filter
    pass over that JSONL file via ``process_jsonl_file``.
    """
    input_path, out_dir, log = args
    process_jsonl_file(input_path, out_dir, log)
if __name__ == '__main__':
    # Pipeline (disabled stages kept for reference):
    # Stage 1 — filter raw records: drop non-JSON rows, unwanted question
    # types, items containing the "如图" (as-shown-in-figure) keyword, and
    # trivially short answers.
    # filtered_data_list = load_ori_file(start_file_path, question_type_list)
    # Stage 2 — convert the surviving records into the processed format.
    # generate_processed_data(filtered_data_list, major, ori_file_path+"/transformed/"+ori_file_pure_name+".processed")
    # Stage 3 — apply the major-2 subject filter.
    # start_process_major2(ori_file_path+"/transformed/"+ori_file_pure_name+".processed", ori_file_path+"/major2_processed/"+ori_file_pure_name+".processed", major)
    # Stage 4 — split the filtered file into fixed-size chunks for AI review.
    src = f"{ori_file_path}/major2_processed/{ori_file_pure_name}.processed"
    dst = f"{ori_file_path}/spilited_ai1/{ori_file_pure_name}"
    split_jsonl(src, dst, chunk_size=int(split_size))
    # Stage 5 — first AI filtering pass over each chunk.
    # ai1_file_list = get_spilit_file_list(ori_file_path+"/spilited/")
    # for ai1_file_path in ai1_file_list:
    #     process_jsonl_file(ai1_file_path, ori_file_path+"/ai_1/", logger)
    # Stage 6 — second AI filtering pass.
    # Stage 7 — consolidate results and logs.