"""Driver script for the question-data filtering pipeline.

Usage:
    python <this_script> <start_file_path> <major> <split_size>

Arguments (read from ``sys.argv`` at import time):
    start_file_path: full path of the source file to process.
    major: value forwarded to the (currently disabled) transform and
        Major_2 stages.
    split_size: chunk size handed to ``split_jsonl`` (converted with ``int``).

All working directories and the log file are created next to the source
file. Only the split stage (and the listing of the split chunks) is active;
the earlier and later stages are kept below as commented-out code.
"""
import logging
import os
from pathlib import Path
import sys

from components.ai_check import get_spilit_file_list, process_jsonl_file
from components.spilit import split_jsonl
from components.step1 import generate_processed_data, load_ori_file, ori_data_validate
from step4_major import start_process_major2

# Input arguments.
# Full path of the source file.
start_file_path = sys.argv[1]
major = sys.argv[2]
split_size = sys.argv[3]

# File name, containing directory, and extension-less name of the source file.
p = Path(start_file_path)
ori_file_name = str(p.name)
ori_file_path = str(p.parent)
ori_file_pure_name = str(p.stem)
question_type_list = ['填空题', '解答题']

# Create the working directories next to the source file.
for _sub_dir in (
    "transformed",
    "major2_processed",
    "spilited_ai1",
    "ai_1",
    "ai_2",
    "ai_2_total",
    "log",
):
    os.makedirs(ori_file_path + "/" + _sub_dir, exist_ok=True)

# Configure logging.
logging.basicConfig(
    level=logging.DEBUG,  # DEBUG is the lowest level, so everything is recorded
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    # Write inside the "log" directory created above. The previous value
    # lacked the "/" after "log", so the file ended up beside that directory
    # as "<parent>/log<name>.log".
    filename=ori_file_path + "/log/" + ori_file_pure_name + '.log',
    filemode='a',  # 'a' appends, 'w' would overwrite
)

# Module-level logger.
logger = logging.getLogger(__name__)


def process_wrapper(args):
    """Unpack a single argument tuple and forward it to ``process_jsonl_file``.

    Args:
        args: a 3-item sequence ``(ai1_file_path, output_dir, logger)``.

    NOTE(review): presumably meant for map-style APIs that pass one argument
    per task; nothing in this file calls it -- confirm before removing.
    """
    ai1_file_path, output_dir, logger = args
    process_jsonl_file(ai1_file_path, output_dir, logger)


if __name__ == '__main__':
    # Disabled stage: filter out non-JSON data, filter by question type,
    # filter on the 【如图】 keyword, filter out trivial answers.
    # filtered_data_list = load_ori_file(start_file_path,question_type_list)

    # Disabled stage: data format conversion.
    # generate_processed_data(filtered_data_list,major,ori_file_path+"/transformed/"+ori_file_pure_name+".processed")

    # Disabled stage: Major_2 data filtering.
    # start_process_major2(ori_file_path+"/transformed/"+ori_file_pure_name+".processed",ori_file_path+"/major2_processed/"+ori_file_pure_name+".processed",major)

    # Split the Major_2 output into chunks.
    split_jsonl(
        ori_file_path + "/major2_processed/" + ori_file_pure_name + ".processed",
        ori_file_path + "/spilited_ai1/" + ori_file_pure_name,
        chunk_size=int(split_size),
    )

    # First AI filtering pass.
    # List the chunks from the directory split_jsonl was pointed at above;
    # the previous "/spilited/" path is never created by this script.
    ai1_file_list = get_spilit_file_list(ori_file_path + "/spilited_ai1/")
    # for ai1_file_path in ai1_file_list:
    #     process_jsonl_file(ai1_file_path,ori_file_path+"/ai_1/",logger)

    # Second AI filtering pass (not implemented yet).
    # Merge results and logs (not implemented yet).