You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
69 lines
2.5 KiB
69 lines
2.5 KiB
1 month ago
|
import logging
|
||
|
import os
|
||
|
from pathlib import Path
|
||
|
import sys
|
||
|
from components.ai_check import get_spilit_file_list, process_jsonl_file
|
||
|
from components.spilit import split_jsonl
|
||
|
from components.step1 import generate_processed_data, load_ori_file, ori_data_validate
|
||
|
from step4_major import start_process_major2
|
||
|
|
||
|
#输入信息
|
||
|
#完整文件路径
|
||
|
start_file_path = sys.argv[1]
|
||
|
major = sys.argv[2]
|
||
|
split_size = sys.argv[3]
|
||
|
|
||
|
# 获取文件名,文件路径
|
||
|
p = Path(start_file_path)
|
||
|
ori_file_name = str(p.name)
|
||
|
ori_file_path = str(p.parent)
|
||
|
ori_file_pure_name = str(p.stem)
|
||
|
question_type_list = ['填空题', '解答题']
|
||
|
|
||
|
# 初始化文件夹路径
|
||
|
os.makedirs(ori_file_path+"/transformed", exist_ok=True)
|
||
|
os.makedirs(ori_file_path+"/major2_processed", exist_ok=True)
|
||
|
os.makedirs(ori_file_path+"/spilited_ai1", exist_ok=True)
|
||
|
os.makedirs(ori_file_path+"/ai_1", exist_ok=True)
|
||
|
os.makedirs(ori_file_path+"/ai_2", exist_ok=True)
|
||
|
os.makedirs(ori_file_path+"/ai_2_total", exist_ok=True)
|
||
|
os.makedirs(ori_file_path+"/log", exist_ok=True)
|
||
|
|
||
|
# 配置日志记录器
|
||
|
logging.basicConfig(
|
||
|
level=logging.DEBUG, # 设置日志级别为 DEBUG(最低级别)
|
||
|
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', # 日志格式
|
||
|
filename=ori_file_path+"/log"+ori_file_pure_name+'.log', # 将日志写入文件
|
||
|
filemode='a' # 写入模式('w' 表示覆盖,'a' 表示追加)
|
||
|
)
|
||
|
|
||
|
# 创建日志记录器
|
||
|
logger = logging.getLogger(__name__)
|
||
|
|
||
|
def process_wrapper(args):
|
||
|
ai1_file_path, output_dir, logger = args
|
||
|
process_jsonl_file(ai1_file_path, output_dir, logger)
|
||
|
|
||
|
if __name__ == '__main__':
|
||
|
# 非json数据过滤
|
||
|
# 题型过滤
|
||
|
# 【如图】关键字过滤
|
||
|
# 简单回答过滤
|
||
|
# filtered_data_list = load_ori_file(start_file_path,question_type_list)
|
||
|
# 数据格式转换
|
||
|
# generate_processed_data(filtered_data_list,major,ori_file_path+"/transformed/"+ori_file_pure_name+".processed")
|
||
|
# Major_2数据过滤
|
||
|
# start_process_major2(ori_file_path+"/transformed/"+ori_file_pure_name+".processed",ori_file_path+"/major2_processed/"+ori_file_pure_name+".processed",major)
|
||
|
# 文件大小分割
|
||
|
split_jsonl(ori_file_path+"/major2_processed/"+ori_file_pure_name+".processed",
|
||
|
ori_file_path+"/spilited_ai1/"+ori_file_pure_name, chunk_size=int(split_size))
|
||
|
# AI过滤第一遍
|
||
|
# ai1_file_list = get_spilit_file_list(ori_file_path+"/spilited/")
|
||
|
# for ai1_file_path in ai1_file_list:
|
||
|
# process_jsonl_file(ai1_file_path,ori_file_path+"/ai_1/",logger)
|
||
|
# AI过滤第二遍
|
||
|
|
||
|
|
||
|
# 整合结果和日志
|
||
|
|
||
|
|