commit 061fa4d3f1020c4ba0268c0135a1f73914987b16
Author: jiangAB
Date:   Fri Jun 13 17:09:49 2025 +0800

    init

diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000..f21236e
Binary files /dev/null and b/.DS_Store differ
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..0ac3693
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+work_data
+venv
\ No newline at end of file
diff --git a/__pycache__/step4_major.cpython-313.pyc b/__pycache__/step4_major.cpython-313.pyc
new file mode 100644
index 0000000..bb9da5f
Binary files /dev/null and b/__pycache__/step4_major.cpython-313.pyc differ
diff --git a/asd.py b/asd.py
new file mode 100644
index 0000000..0f75d86
--- /dev/null
+++ b/asd.py
@@ -0,0 +1,3 @@
+import json
+from components.jsonl_repair_reindex import process_jsonl
+process_jsonl('./huaxue_en.jsonl', './huaxue_en.jsonl.jsonl')
diff --git a/components/.DS_Store b/components/.DS_Store
new file mode 100644
index 0000000..5008ddf
Binary files /dev/null and b/components/.DS_Store differ
diff --git a/components/__init__.py b/components/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/components/__pycache__/__init__.cpython-313.pyc b/components/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000..4e4d4df
Binary files /dev/null and b/components/__pycache__/__init__.cpython-313.pyc differ
diff --git a/components/__pycache__/ai_check.cpython-313.pyc b/components/__pycache__/ai_check.cpython-313.pyc
new file mode 100644
index 0000000..07cd267
Binary files /dev/null and b/components/__pycache__/ai_check.cpython-313.pyc differ
diff --git a/components/__pycache__/doubao_process.cpython-313.pyc b/components/__pycache__/doubao_process.cpython-313.pyc
new file mode 100644
index 0000000..bb53496
Binary files /dev/null and b/components/__pycache__/doubao_process.cpython-313.pyc differ
diff --git a/components/__pycache__/jsonl_repair_reindex.cpython-313.pyc b/components/__pycache__/jsonl_repair_reindex.cpython-313.pyc
new file mode 100644
index 0000000..7875e18
Binary files /dev/null and b/components/__pycache__/jsonl_repair_reindex.cpython-313.pyc differ
diff --git a/components/__pycache__/spilit.cpython-313.pyc b/components/__pycache__/spilit.cpython-313.pyc
new file mode 100644
index 0000000..efee036
Binary files /dev/null and b/components/__pycache__/spilit.cpython-313.pyc differ
diff --git a/components/__pycache__/step1.cpython-313.pyc b/components/__pycache__/step1.cpython-313.pyc
new file mode 100644
index 0000000..6151aa3
Binary files /dev/null and b/components/__pycache__/step1.cpython-313.pyc differ
diff --git a/components/ai_check.py b/components/ai_check.py
new file mode 100644
index 0000000..8d20f64
--- /dev/null
+++ b/components/ai_check.py
@@ -0,0 +1,66 @@
+import glob
+import json
+from pathlib import Path
+import time
+
+from components.doubao_process import ask_question
+# from components.jsonl_repair_reindex import process_json
+
+def get_spilit_file_list(file_path):
+    # Match all .jsonl files directly under the directory (the pattern has no
+    # '**', so recursive=True changes nothing here)
+    recursive_txt_files = glob.glob(file_path + '/*.jsonl', recursive=True)
+    return recursive_txt_files
+
+
+def process_jsonl_file(file_path, output_folder, logger):
+    base_prompt = 'The answer_detail in this JSON is not detailed enough; rewrite it as a detailed, step-by-step derivation in the format "Step 1: ...\\nStep 2: ...\\nStep 3: ...". Every formula used must be written out on its own (e.g. "F=ma") rather than left implicit in prose, and the substitution of concrete values into each formula must be shown (e.g. "9.8=5×a"). Use double backslashes for LaTeX. Return a new JSON object with id and answer_detail fields, with no line breaks in the JSON and no other reply.'
+    temp_cache_0 = list(read_jsonl(file_path))
+
+    start_time = time.time()
+    for item in temp_cache_0:
+        quest_detail = process_question(item)
+        full_question = base_prompt + quest_detail
+        ai_response = ask_question(full_question)
+        logger.info(f"Question: {quest_detail}")
+        logger.info(f"Answer: {ai_response}")
+        logger.info("===================================")
+        # Append the raw model reply to a .replace file in the output folder
+        # (the folder the shell pipeline polls for completed files)
+        output_path = Path(output_folder) / (Path(file_path).name + '.replace')
+        with open(output_path, 'a', encoding='utf-8') as f:
+            # json_str = process_json(ai_response)
+            # json_line = json.dumps(ai_response, ensure_ascii=False)
+            f.write(ai_response + '\n')
+
+    end_time = time.time()
+    logger.info(f"Processing time: {end_time - start_time} seconds")
+def read_jsonl(file_path):
+    with open(file_path, 'r', encoding='utf-8') as f:
+        for line in f:
+            yield json.loads(line.strip())  # parse each line into a dict
+
+
+# Normalisation helper
+def process_question(json_line):
+    data = json_line
+    # Extract the key fields (adjust to the actual JSON structure)
+    question = data.get('q_main', '')
+    answer = data.get('answer_detail', '')
+
+    # Text compaction (currently disabled)
+    # question = re.sub(r'\s+', ' ', question).strip()
+    # question = re.sub(r'(\d)\s+(\d)', r'\1\2', question)  # rejoin digits split by whitespace
+    # Feed the whole record to the model as a single JSON string
+    return json.dumps(data, ensure_ascii=False)
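
components/doubao_process.py is present in this commit only as a compiled .pyc, so the ask_question imported above is opaque here. A minimal stand-in for dry-running process_jsonl_file offline might look like the sketch below; this is purely an assumed interface, not the real implementation:

    # Hypothetical stand-in for components.doubao_process.ask_question; the real
    # module ships only as a .pyc in this commit, so this guesses its interface
    # for offline testing and returns a fixed, well-formed reply.
    def ask_question(prompt: str) -> str:
        return '{"id": 1, "answer_detail": "Step 1: ...\\nStep 2: ...\\nStep 3: ..."}'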
+
+
+# Returns True only for short strings (at most 5 characters) that contain
+# at least one Chinese character.
+def is_chinese(s):
+    if len(s) > 5:
+        return False
+    # Use a regular expression to detect Chinese characters
+    import re
+    if re.search(r'[\u4e00-\u9fa5]', s):
+        return True
+    return False
+
+
+def generate_processed_data(ori_data_list, major, output_path):
+    start_id = 1
+    processed_data_list = []
+    difficulty_map = {
+        '困难': '难',
+        '高': '难',
+        'high': '难',
+        '5': '难',
+        '5星': '难',
+        'hard': '难',
+        '中': '中',
+        'medium': '中',
+        '4': '中',
+        '3': '中',
+        '4星': '中',
+        '3星': '中',
+        '低': '低',
+        'low': '低',
+        '1': '低',
+        '1星': '低',
+        '2': '低',
+        '2星': '低',
+        'easy': '低',
+    }
+
+    # Mapping between major and major_2
+    for ori_item in ori_data_list:
+        try:
+            # Normalise the difficulty
+            if ori_item['difficulty'] in difficulty_map:
+                ori_item['difficulty'] = difficulty_map[ori_item['difficulty']]
+            else:
+                # If no rule matches, skip the item (or log it)
+                continue
+            # Build the output record
+            processed_item = {
+                "grade_class": "高等教育",
+                "grade": "大学",
+                "major": major,
+                "major_2": ori_item['subject'],
+                "language": "zh",
+                "id": start_id,
+                "q_main": ori_item['question'],
+                "std_ans": ori_item['answer'],
+                "answer_detail": ori_item['analyzing'],
+                "hard_level": ori_item['difficulty'],
+                "keypoint": ori_item['knowledge_point'],
+                "q_type": ori_item['type']
+            }
+            # Repair illegal/unencodable characters
+            processed_item = clean_data(processed_item)
+            processed_data_list.append(processed_item)
+            start_id += 1
+        except Exception as e:
+            # logger.warning(f"KeyError: {e} in item: {ori_item}")
+            continue
+    # Save the list as a .jsonl file; this is the earliest filtered and reformatted result
+    print(f"Total valid JSON objects: {len(processed_data_list)}")
+    print("Writing the processed file, please wait...")
+    with jsonlines.open(output_path, mode='w') as writer:
+        writer.write_all(processed_data_list)
+    print("Write complete!")
+
+
+
+def clean_text(text):
+    """Replace illegal characters in a string"""
+    if isinstance(text, str):
+        # Replace isolated surrogate characters
+        return text.encode('utf-8', errors='replace').decode('utf-8')
+    return text
+
+def clean_data(data):
+    if isinstance(data, dict):
+        return {key: clean_data(value) for key, value in data.items()}
+    elif isinstance(data, list):
+        return [clean_data(item) for item in data]
+    elif isinstance(data, str):
+        return clean_text(data)
+    return data
\ No newline at end of file
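
For reference, here is a record as emitted by generate_processed_data, run through clean_data (field values are invented; note that clean_data only repairs characters that cannot be encoded, it does not strip HTML tags):

    from components.step1 import clean_data  # per the imports in step1_pre.py

    record = {
        "grade_class": "高等教育", "grade": "大学", "major": "物理",
        "major_2": "力学", "language": "zh", "id": 1,
        "q_main": "一质量为5 kg的物体受力9.8 N,求加速度。\ud800",  # stray surrogate
        "std_ans": "a = 1.96 m/s^2",
        "answer_detail": "Step 1: F=ma\nStep 2: 9.8=5×a\nStep 3: a=1.96 m/s^2",
        "hard_level": "低", "keypoint": "牛顿第二定律", "q_type": "解答题",
    }
    print(clean_data(record)["q_main"])  # the lone surrogate is replaced by '?'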
diff --git a/jsonl_clear.py b/jsonl_clear.py
new file mode 100644
index 0000000..c3af7a5
--- /dev/null
+++ b/jsonl_clear.py
@@ -0,0 +1,59 @@
+import json
+import os
+import argparse
+from json_repair import repair_json
+
+def is_valid_json(json_str):
+    """Check whether a string is valid JSON"""
+    try:
+        json.loads(json_str)
+        return True
+    except json.JSONDecodeError:
+        return False
+
+def process_jsonl(input_file, output_file):
+    if not os.path.exists(input_file):
+        raise FileNotFoundError(f"Input file does not exist: {input_file}")
+
+    id_counter = 1  # auto-increment ID counter
+
+    with open(input_file, 'r', encoding='utf-8') as infile, \
+         open(output_file, 'w', encoding='utf-8') as outfile:
+
+        for line_number, line in enumerate(infile, start=1):
+            line = line.strip()
+            if not line:
+                continue
+
+            # Parse the line directly if possible
+            if is_valid_json(line):
+                data = json.loads(line)
+            else:
+                # Otherwise try to repair it
+                try:
+                    repaired = repair_json(line)
+                    if repaired and is_valid_json(repaired):
+                        data = json.loads(repaired)
+                    else:
+                        continue
+                except Exception as e:
+                    print(f"Line {line_number}: unrepairable JSON: {line} | error: {e}")
+                    continue
+
+            # Add the auto-increment id field
+            data['id'] = id_counter
+            id_counter += 1
+
+            # Write out, preserving Chinese and other non-ASCII characters
+            outfile.write(json.dumps(data, ensure_ascii=False) + '\n')
+
+    print(f"Done: wrote {id_counter - 1} records. Output saved to: {output_file}")
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Process a JSONL file: add auto-increment IDs and repair invalid JSON")
+    parser.add_argument("--input", type=str, required=True, help="Input JSONL file path")
+    parser.add_argument("--output", type=str, required=True, help="Output JSONL file path")
+
+    args = parser.parse_args()
+
+    process_jsonl(args.input, args.output)
\ No newline at end of file
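
jsonl_clear.py leans on the json-repair package (pip install json-repair) for lines that fail strict parsing. A minimal illustration of that fallback, with a made-up broken line:

    import json
    from json_repair import repair_json

    broken = '{"q_main": "What is 2+2?", "std_ans": "4",}'  # trailing comma: invalid JSON
    fixed = repair_json(broken)          # returns a repaired JSON string
    print(json.loads(fixed)["std_ans"])  # -> 4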
diff --git a/process.log b/process.log
new file mode 100644
index 0000000..e69de29
diff --git a/process.sh b/process.sh
new file mode 100644
index 0000000..c5cabec
--- /dev/null
+++ b/process.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+
+folderPath=$1
+fileName=$2
+major=$3
+split_size=$4
+
+echo "Preprocessing the input file, please wait..."
+python3 step1_pre.py "$folderPath/$fileName" "$major" "$split_size"
+echo "Preprocessing finished; see the spilited_ai1 folder for the results"
+
+echo "Starting AI answer-correctness check, pass 1..."
+i=0
+for file in $(ls "$folderPath"/spilited_ai1 | sort -V)
+do
+    echo "Launching task for file: $folderPath/spilited_ai1/$file"
+    nohup python3 step2_ai1.py "$folderPath" "$folderPath/spilited_ai1/$file" "$i" > /dev/null 2>&1 &
+    i=$((i+1))
+done
+
+# Wait for all AI pass-1 tasks to finish
+while true; do
+    file_count=$(ls -1 "$folderPath/ai_1" | wc -l)
+    echo "AI pass 1: current file count is $file_count"
+    if [ "$file_count" -ge "$i" ]; then
+        echo "All AI pass-1 tasks finished; continuing..."
+        break
+    else
+        echo "Fewer than $i files; checking again in 10 seconds..."
+        sleep 10
+    fi
+done
+
+echo "Starting AI answer-correctness check, pass 2..."
+j=0
+for file in $(ls "$folderPath"/ai_1 | sort -V)
+do
+    echo "Launching task for file: $folderPath/ai_1/$file"
+    nohup python3 step3_ai2.py "$folderPath" "$folderPath/ai_1/$file" "$j" > /dev/null 2>&1 &
+    j=$((j+1))
+done
+
+# Wait for all AI pass-2 tasks to finish
+while true; do
+    file_count=$(ls -1 "$folderPath/ai_2" | wc -l)
+    echo "AI pass 2: current file count is $file_count"
+    if [ "$file_count" -ge "$j" ]; then
+        echo "All AI pass-2 tasks finished; continuing..."
+        break
+    else
+        echo "Fewer than $j files; checking again in 10 seconds..."
+        sleep 10
+    fi
+done
+
+echo "Merging the final results"
+cat "$folderPath"/ai_2/* > "$folderPath"/ai_2_total/final_output.jsonl
+
+echo "All done. When merging the data, remember to reassign the subject fields and re-number the IDs!"
\ No newline at end of file
diff --git a/readme.md b/readme.md
new file mode 100644
index 0000000..59331b8
--- /dev/null
+++ b/readme.md
@@ -0,0 +1,25 @@
+# Batch correctness checking for AI-generated questions
+Automatically preprocesses the raw data, filters it, runs a first and then a second AI check, and merges the final result into ai_2_total.
+Choose the split size yourself so that the input file is split into roughly 100 chunks that can run in parallel.
+
+
+# Usage
+1. Create a directory on the server, e.g. ./test_01 (an absolute path also works), and put the raw JSONL file inside it, e.g. ./test_01/test.jsonl
+2. Run start.sh with four positional arguments, e.g. sh start.sh ./test_01 test.jsonl 测试 10
+folder path: a directory holding only the raw file, e.g. ./test_01
+file name: the raw JSONL file, e.g. test.jsonl
+subject: the subject to process, e.g. 物理
+split size: the chunk size for splitting the file for parallel AI queries, e.g. 10000
+
+3. After a successful run, several directories are created under ./test_01:
+transformed: data saved here after the initial validity filtering
+spilited_ai1: the split chunks used for the first AI check
+ai_1: output directory of the first AI check
+ai_2: output directory of the second AI check
+ai_2_total: the final data merged after both checks
+
+Once started, progress can be followed in process.log.
\ No newline at end of file
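
A quick way to sanity-check the merged result, using the paths from the README walkthrough above (the file must already exist, so this is illustrative only):

    import json

    path = "./test_01/ai_2_total/final_output.jsonl"  # path from the example above
    with open(path, encoding="utf-8") as f:
        records = [json.loads(line) for line in f if line.strip()]
    print(len(records), "records; first id:", records[0].get("id"))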
diff --git a/replace.py b/replace.py
new file mode 100644
index 0000000..c7782e7
--- /dev/null
+++ b/replace.py
@@ -0,0 +1,30 @@
+
+
+import logging
+import sys
+
+from components.ai_check import process_jsonl_file
+
+# Usage:
+# python3 replace.py parent_path file_path file_index
+
+parent_path = sys.argv[1]
+file_path = sys.argv[2]
+file_index = sys.argv[3]
+
+
+# Configure the logger
+logging.basicConfig(
+    level=logging.DEBUG,  # log everything from DEBUG up
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    filename=parent_path + "/log/ai1_log_" + str(file_index) + '.log',  # write the log to a file
+    filemode='a'  # 'w' overwrites, 'a' appends
+)
+# Create the logger
+logger = logging.getLogger(__name__)
+
+if __name__ == '__main__':
+    # Read the data file
+    print("Start")
+    temp_filepath = file_path
+    process_jsonl_file(temp_filepath, parent_path + "/ai_1", logger=logger)
\ No newline at end of file
diff --git a/replace_answer_detail.py b/replace_answer_detail.py
new file mode 100644
index 0000000..dc68405
--- /dev/null
+++ b/replace_answer_detail.py
@@ -0,0 +1,64 @@
+import json
+import sys
+
+def load_b_file(filepath):
+    """Read file B and return a dict keyed by id"""
+    b_data = {}
+    with open(filepath, 'r', encoding='utf-8') as f:
+        for line in f:
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                data = json.loads(line)
+                # Keep only the field needed for the update
+                if 'id' in data and 'answer_detail' in data:
+                    b_data[data['id']] = data['answer_detail']
+                else:
+                    print(f"Warning: required fields missing in file B: {line}")
+            except json.JSONDecodeError as e:
+                print(f"Parse failure (file B): {e} -> {line[:50]}...")
+    return b_data
+
+def update_a_file(a_filepath, b_dict, output_filepath):
+    """Read file A, replace answer_detail from b_dict, and write the output file"""
+    with open(a_filepath, 'r', encoding='utf-8') as fin, \
+         open(output_filepath, 'w', encoding='utf-8') as fout:
+
+        for line_num, line in enumerate(fin, start=1):
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                data = json.loads(line)
+                record_id = data.get('id')
+
+                # If B has a matching id, replace answer_detail
+                if record_id in b_dict:
+                    data['answer_detail'] = b_dict[record_id]
+
+                # Write the record back out
+                fout.write(json.dumps(data, ensure_ascii=False) + '\n')
+
+            except json.JSONDecodeError as e:
+                print(f"Parse failure at line {line_num}: {e} -> {line[:50]}...")
+
+def main():
+    if len(sys.argv) != 4:
+        print("Usage: python replace_answer_detail.py <file A path> <file B path> <output file path>")
+        sys.exit(1)
+
+    a_file = sys.argv[1]
+    b_file = sys.argv[2]
+    output_file = sys.argv[3]
+
+    print("Loading file B ...")
+    b_dict = load_b_file(b_file)
+
+    print(f"Loaded {len(b_dict)} records. Processing file A ...")
+    update_a_file(a_file, b_dict, output_file)
+
+    print("Done! Output saved to:", output_file)
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
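
The merge that replace_answer_detail.py performs, reduced to plain dictionaries (sample data invented):

    # Core of update_a_file: overlay answer_detail from B onto A, keyed by id.
    a_lines = [{"id": 1, "answer_detail": "too short"}, {"id": 2, "answer_detail": "kept"}]
    b_dict = {1: "Step 1: F=ma\nStep 2: 9.8=5×a\nStep 3: a=1.96"}  # as built by load_b_file

    for rec in a_lines:
        if rec["id"] in b_dict:
            rec["answer_detail"] = b_dict[rec["id"]]
    print(a_lines)  # id 1 is rewritten, id 2 is untouched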
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..ff61b58
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+wheel @ file:///opt/homebrew/Cellar/python%403.13/3.13.2/libexec/wheel-0.45.1-py3-none-any.whl#sha256=b9235939e2096903717cb6bfc132267f8a7e46deb2ec3ef9c5e234ea301795d0
diff --git a/reset_id.py b/reset_id.py
new file mode 100644
index 0000000..55c9a83
--- /dev/null
+++ b/reset_id.py
@@ -0,0 +1,100 @@
+# Re-number the id field before delivery
+# Also strips the leading numbering from each question
+
+import argparse
+import jsonlines
+from tqdm import tqdm
+import re
+
+html_tag_pattern = re.compile(r'<[^>]+>')  # detects HTML tags
+unicode_pattern = r'\\u[0-9a-fA-F]{4,}'
+# Input and output file paths
+def parse_args():
+    parser = argparse.ArgumentParser(description="Reset IDs in a JSONL file")
+    parser.add_argument("--input", required=True, help="Input JSONL file path")
+    parser.add_argument("--output", required=True, help="Output JSONL file path")
+    parser.add_argument("--major", required=True, help="Subject name")
+    parser.add_argument("--start_id", type=int, default=1, help="Start ID (default: 1)")
+    return parser.parse_args()
+
+
+# Count the lines in the input file (for the progress bar)
+def count_lines(file_path):
+    with open(file_path, "r", encoding="utf-8") as f:
+        return sum(1 for _ in f)
+def process_file(input_file, output_file, start_id, major):
+    total_lines = count_lines(input_file)
+    # Read the input line by line and write each processed line out
+    with jsonlines.open(input_file, mode="r") as reader, jsonlines.open(output_file, mode="w") as writer:
+        new_id = start_id  # initialise the new ID
+
+        # Wrap the reader in tqdm to show a progress bar
+        for line in tqdm(reader, total=total_lines, desc="Processing", unit="line"):
+            q_main = line.get("q_main", "")
+            answer_detail = line.get("answer_detail", None)
+            std_ans = line.get("std_ans", None)
+            keypoint = line.get("keypoint", None)
+            major_2 = line.get("major_2", None)
+            # Skip conditions (currently disabled):
+            # 1. q_main starts with a digit
+            # 2. answer_detail is missing or not a string
+            # 3. std_ans is missing or not a string
+            # if (
+            #     re.match(r'^\s*\d(?!\d)', q_main) or
+            #     not isinstance(answer_detail, str) or
+            #     not isinstance(std_ans, str) or
+            #     not isinstance(keypoint, str) or
+            #     not isinstance(major_2, str) or
+            #     html_tag_pattern.search(q_main) or
+            #     html_tag_pattern.search(answer_detail) or
+            #     html_tag_pattern.search(std_ans) or
+            #     re.search(unicode_pattern, major_2)
+            # ):
+            #     continue
+            # Overwrite the id field of the current line
+            line["id"] = new_id
+            # line["grade"] = "研究生"
+            # line["major"] = major
+            # line["q_main"] = full_to_half(remove_prefix(line["q_main"]))
+            # line["answer_detail"] = full_to_half(line["answer_detail"])
+            # line["std_ans"] = full_to_half(line["std_ans"])
+            # Write the modified line to the output file
+            writer.write(line)
+            # Advance the ID
+            new_id += 1
+
+def remove_prefix(question):
+    """
+    Remove leading numbering from the question field, e.g.:
+    - "1.", "2.", "1. ", "2. "
+    - full-width variants such as "1." and "1、"
+    - "1)"
+    - "2 题目:"
+    - "1题:", "2题:"
+    - escaped forms such as "2\\.", "3\\."
+    """
+    # Regular expression covering the possible prefixes
+    pattern = r'^\s*\d+[\.。\\))、]|^\s*\d+\s*题目:|^\s*\d+题:'
+    result = re.sub(pattern, '', question).lstrip()
+    return result
+def full_to_half(text):
+    """Convert full-width characters in the text to half-width"""
+    res = ''
+    for char in text:
+        code = ord(char)
+        if code == 12288:  # full-width space
+            res += chr(32)
+        elif 65281 <= code <= 65374:  # full-width punctuation and letters
+            res += chr(code - 65248)
+        else:
+            res += char
+    return res
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    process_file(args.input, args.output, args.start_id, args.major)
+    print("ID reset complete; saved to new file:", args.output)
\ No newline at end of file
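
The two string helpers in reset_id.py behave as follows on made-up inputs (importable as long as jsonlines and tqdm are installed, since reset_id imports them at module level):

    from reset_id import remove_prefix, full_to_half

    print(remove_prefix("3. 求物体的加速度"))  # -> "求物体的加速度"
    print(remove_prefix("12题:化简下式"))    # -> "化简下式"
    print(full_to_half("(a+b)=c 12"))      # -> "(a+b)=c 12"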
diff --git a/start.sh b/start.sh
new file mode 100644
index 0000000..b19f0fe
--- /dev/null
+++ b/start.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+folderPath=$1
+fileName=$2
+major=$3
+split_size=$4
+
+echo "Preprocessing the input file, please wait..."
+python3 step1_pre.py "$folderPath/$fileName" "$major" "$split_size"
+echo "Preprocessing finished; see the spilited_ai1 folder for the results"
+
+echo "Starting AI answer-correctness check, pass 1..."
+i=0
+for file in $(ls "$folderPath"/spilited_ai1 | sort -V)
+do
+    echo "Launching task for file: $folderPath/spilited_ai1/$file"
+    nohup python3 replace.py "$folderPath" "$folderPath/spilited_ai1/$file" "$i" > /dev/null 2>&1 &
+    i=$((i+1))
+done
\ No newline at end of file
diff --git a/step1_pre.py b/step1_pre.py
new file mode 100644
index 0000000..942b296
--- /dev/null
+++ b/step1_pre.py
@@ -0,0 +1,69 @@
+import logging
+import os
+from pathlib import Path
+import sys
+from components.ai_check import get_spilit_file_list, process_jsonl_file
+from components.spilit import split_jsonl
+from components.step1 import generate_processed_data, load_ori_file, ori_data_validate
+from step4_major import start_process_major2
+
+# Inputs
+# Full path to the raw file
+start_file_path = sys.argv[1]
+major = sys.argv[2]
+split_size = sys.argv[3]
+
+# Derive the file name, directory, and stem
+p = Path(start_file_path)
+ori_file_name = str(p.name)
+ori_file_path = str(p.parent)
+ori_file_pure_name = str(p.stem)
+question_type_list = ['填空题', '解答题']
+
+# Create the working directories
+os.makedirs(ori_file_path + "/transformed", exist_ok=True)
+os.makedirs(ori_file_path + "/major2_processed", exist_ok=True)
+os.makedirs(ori_file_path + "/spilited_ai1", exist_ok=True)
+os.makedirs(ori_file_path + "/ai_1", exist_ok=True)
+os.makedirs(ori_file_path + "/ai_2", exist_ok=True)
+os.makedirs(ori_file_path + "/ai_2_total", exist_ok=True)
+os.makedirs(ori_file_path + "/log", exist_ok=True)
+
+# Configure the logger
+logging.basicConfig(
+    level=logging.DEBUG,  # log everything from DEBUG up
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    filename=ori_file_path + "/log/" + ori_file_pure_name + '.log',  # write the log to a file
+    filemode='a'  # 'w' overwrites, 'a' appends
+)
+
+# Create the logger
+logger = logging.getLogger(__name__)
+
+def process_wrapper(args):
+    ai1_file_path, output_dir, logger = args
+    process_jsonl_file(ai1_file_path, output_dir, logger)
+
+if __name__ == '__main__':
+    # Filter out non-JSON data
+    # Filter by question type
+    # Filter out questions that refer to figures (【如图】)
+    # Filter out trivially short answers
+    # filtered_data_list = load_ori_file(start_file_path, question_type_list)
+    # Convert the data format
+    # generate_processed_data(filtered_data_list, major, ori_file_path + "/transformed/" + ori_file_pure_name + ".processed")
+    # Filter by major_2
+    # start_process_major2(ori_file_path + "/transformed/" + ori_file_pure_name + ".processed", ori_file_path + "/major2_processed/" + ori_file_pure_name + ".processed", major)
+    # Split the file into chunks
+    split_jsonl(ori_file_path + "/major2_processed/" + ori_file_pure_name + ".processed",
+                ori_file_path + "/spilited_ai1/" + ori_file_pure_name, chunk_size=int(split_size))
+    # First AI filtering pass
+    # ai1_file_list = get_spilit_file_list(ori_file_path + "/spilited/")
+    # for ai1_file_path in ai1_file_list:
+    #     process_jsonl_file(ai1_file_path, ori_file_path + "/ai_1/", logger)
+    # Second AI filtering pass
+
+
+    # Merge results and logs
+
\ No newline at end of file
diff --git a/step2_ai1.py b/step2_ai1.py
new file mode 100644
index 0000000..c7782e7
--- /dev/null
+++ b/step2_ai1.py
@@ -0,0 +1,30 @@
+
+
+import logging
+import sys
+
+from components.ai_check import process_jsonl_file
+
+# Usage:
+# python3 step2_ai1.py parent_path file_path file_index
+
+parent_path = sys.argv[1]
+file_path = sys.argv[2]
+file_index = sys.argv[3]
+
+
+# Configure the logger
+logging.basicConfig(
+    level=logging.DEBUG,  # log everything from DEBUG up
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    filename=parent_path + "/log/ai1_log_" + str(file_index) + '.log',  # write the log to a file
+    filemode='a'  # 'w' overwrites, 'a' appends
+)
+# Create the logger
+logger = logging.getLogger(__name__)
+
+if __name__ == '__main__':
+    # Read the data file
+    print("Start")
+    temp_filepath = file_path
+    process_jsonl_file(temp_filepath, parent_path + "/ai_1", logger=logger)
\ No newline at end of file
diff --git a/step3_ai2.py b/step3_ai2.py
new file mode 100644
index 0000000..bb31e58
--- /dev/null
+++ b/step3_ai2.py
@@ -0,0 +1,30 @@
+
+
+import logging
+import sys
+
+from components.ai_check import process_jsonl_file
+
+# Usage:
+# python3 step3_ai2.py parent_path file_path file_index
+
+parent_path = sys.argv[1]
+file_path = sys.argv[2]
+file_index = sys.argv[3]
+
+
+# Configure the logger
+logging.basicConfig(
+    level=logging.DEBUG,  # log everything from DEBUG up
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    filename=parent_path + "/log/ai2_log_" + str(file_index) + '.log',  # write the log to a file
+    filemode='a'  # 'w' overwrites, 'a' appends
+)
+# Create the logger
+logger = logging.getLogger(__name__)
+
+if __name__ == '__main__':
+    # Read the data file
+    print("Start")
+    temp_filepath = file_path
+    process_jsonl_file(temp_filepath, parent_path + "/ai_2", logger=logger)
\ No newline at end of file
diff --git a/step4_major.py b/step4_major.py
new file mode 100644
index 0000000..dd87c17
--- /dev/null
+++ b/step4_major.py
@@ -0,0 +1,68 @@
+# Filter and normalise the major_2 field before delivery
+
+import argparse
+import jsonlines
+from tqdm import tqdm
+import re
+
+# Input and output file paths
+def parse_args():
+    parser = argparse.ArgumentParser(description="Process major_2")
+    parser.add_argument("--input", required=True, help="Input JSONL file path")
+    parser.add_argument("--output", required=True, help="Output JSONL file path")
+    parser.add_argument("--major", required=True, help="Subject name")
+    return parser.parse_args()
+
+
+# Count the lines in the input file (for the progress bar)
+def count_lines(file_path):
+    with open(file_path, "r", encoding="utf-8") as f:
+        return sum(1 for _ in f)
+def process_file(input_file, output_file, major2_keywords):
+    total_lines = count_lines(input_file)
+    # Read the input line by line and write each processed line out
+    with jsonlines.open(input_file, mode="r") as reader, jsonlines.open(output_file, mode="w") as writer:
+        # Wrap the reader in tqdm to show a progress bar
+        for line in tqdm(reader, total=total_lines, desc="Processing", unit="line"):
+            # Check that major_2 is usable
+            if line.get("major_2"):
+                if any(keyword in line["major_2"] for keyword in major2_keywords):
+                    # Strip leading whitespace from major_2
+                    line["major_2"] = line["major_2"].lstrip()
+                    # Reject major_2 values containing Latin letters
+                    if any('a' <= char.lower() <= 'z' for char in line["major_2"]):
+                        print("major_2 contains Latin letters, please check! Content: " + str(line))
+                        continue
+                    # Strip *, /, \, ! and space characters
+                    line["major_2"] = line["major_2"].replace('*', '').replace('/', '').replace('\\', '').replace('!', '').replace(' ', '')
+                    # Strip bracket characters (half- and full-width)
+                    line["major_2"] = re.sub(r'[\[\]()(){}〈〉《》『』【】〖〗〘〙〚〛]', '', line["major_2"])
+                    # Write the modified line to the output file
+                    writer.write(line)
+                else:
+                    print("major_2 is invalid; skipping this line. Content: " + str(line))
+                    continue
+
+
+
+def start_process_major2(input, output, major):
+    major2_keywords = []
+    if major == "化学":
+        # chemistry keywords: 化学, 有机, 无机, 分子
+        major2_keywords = ["有机", "无机", "分子", "化学"]
+    elif major == "物理":
+        # physics keywords: 物理, 天体, 宇宙, 行星, 黑洞, 原子, 核, 力, 动力, 力学, 流体, 光, 光子, 电, 电子, 磁, 量子, 超导, 热, 纳米, 晶体, 半导体, 能量, 相对, 波动, 振动
+        major2_keywords = ["天体", "宇宙", "行星", "黑洞", "原子", "核", "力", "动力", "力学", "流体", "光", "光子", "电", "电子", "磁", "量子", "超导", "热", "纳米", "晶体", "半导体", "能量", "相对", "波动", "振动", "物理"]
+    else:
+        print("Please provide a valid subject!")
+        exit()
+    process_file(input, output, major2_keywords)
+    print("major_2 cleanup complete; saved to new file:", output)
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    start_process_major2(args.input, args.output, args.major)
+
\ No newline at end of file
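
Finally, the keyword gate that start_process_major2 applies to major_2, checked against a few invented subject strings:

    major2_keywords = ["有机", "无机", "分子", "化学"]  # the 化学 list above

    for subject in ["有机化学", "高分子物理", "文学理论"]:
        print(subject, any(k in subject for k in major2_keywords))
    # -> 有机化学 True, 高分子物理 True (contains 分子), 文学理论 False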