import json
from multiprocessing import Pool
import re


def _write_chunk(output_path, chunk_index, lines):
    """Write one chunk of raw lines to ``<output_path>_<chunk_index>.jsonl``."""
    with open(f'{output_path}_{chunk_index}.jsonl', 'w', encoding='utf-8') as outfile:
        outfile.writelines(lines)


def split_jsonl(input_path, output_path, chunk_size=100000):
    """Split a JSONL file into numbered chunk files.

    Lines are copied as-is (they are not parsed as JSON). Chunks are written
    to ``<output_path>_0.jsonl``, ``<output_path>_1.jsonl``, ... in input
    order. Every chunk holds ``chunk_size`` lines except possibly the last,
    which holds the remainder. An empty input produces no output files.

    Args:
        input_path: Path of the JSONL file to split.
        output_path: Path prefix for the chunk files; the ``_<n>.jsonl``
            suffix is appended to it.
        chunk_size: Maximum number of lines per chunk file; must be >= 1.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f'chunk_size must be >= 1, got {chunk_size!r}')

    chunk = []
    chunk_index = 0
    # Explicit UTF-8 so the result does not depend on the platform's default
    # locale encoding.
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            chunk.append(line)
            if len(chunk) >= chunk_size:
                _write_chunk(output_path, chunk_index, chunk)
                chunk_index += 1
                chunk = []
    if chunk:  # flush the trailing partial chunk
        _write_chunk(output_path, chunk_index, chunk)


# Normalization function [[8]][[9]]
def process_question(json_line):
    data = json_line
    # Extract the key fields (adjust to the actual JSON structure)
    question = data.get('q_main', '')
    answer = data.get('std_ans', '')
    # Text compression
    # question = re.sub(r'\s+', ' ', question).strip()
    # question = re.sub(r'(\d)\s+(\d)', r'\1\2', question)  # merge digits split by whitespace
    # question = re.sub(r'(?