import json
import re
from multiprocessing import Pool
# Split a large JSONL file into fixed-size chunks.
def split_jsonl(input_path, output_path, chunk_size=100000):
    with open(input_path, 'r') as f:
        chunk = []
        for i, line in enumerate(f):
            chunk.append(line)
            if (i + 1) % chunk_size == 0:
                with open(output_path + f'_{i // chunk_size}.jsonl', 'w') as outfile:
                    outfile.writelines(chunk)
                chunk = []
        if chunk:  # write the remaining partial chunk
            with open(output_path + f'_{i // chunk_size}.jsonl', 'w') as outfile:
                outfile.writelines(chunk)
# Normalize one record into a minimal JSON structure.
def process_question(json_line):
    data = json.loads(json_line)  # parse the raw JSONL line
    # Extract the key fields (adjust to the actual JSON structure).
    question = data.get('q_main', '')
    answer = data.get('std_ans', '')
    # Optional text compression:
    # question = re.sub(r'\s+', ' ', question).strip()
    # question = re.sub(r'(\d)\s+(\d)', r'\1\2', question)  # rejoin digits broken by whitespace
    # question = re.sub(r'(?<![\d.])\d{4,}(?![\d.])',
    #                   lambda m: f"{float(m.group()):.2e}", question)  # scientific notation
    # Reassemble into a minimal JSON record.
    return json.dumps({
        'id': data.get('id'),
        'question': question,
        'answer': answer
    }, ensure_ascii=False)
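# Example transformation (hypothetical record; field names follow the
# q_main/std_ans assumption above):
#   in:  {"id": 7, "q_main": "What is 2 + 2?", "std_ans": "4", "extra": "..."}
#   out: {"id": 7, "question": "What is 2 + 2?", "answer": "4"}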
# Preprocess one chunk file in parallel.
def preprocess_file(file_path):
    with open(file_path, 'r') as f:
        lines = f.readlines()
    with Pool() as pool:  # one worker per CPU core by default
        processed = pool.map(process_question, lines)
    with open(file_path.replace('.jsonl', '_processed.jsonl'), 'w') as f:
        f.write('\n'.join(processed) + '\n')  # trailing newline keeps the output valid JSONL
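# A minimal usage sketch, assuming an input file named 'data.jsonl' (hypothetical
# path). The __main__ guard matters here: multiprocessing re-imports this module
# in worker processes on spawn-based platforms (Windows, macOS), so top-level
# calls would otherwise run again in every worker.
if __name__ == '__main__':
    split_jsonl('data.jsonl', 'data_chunk', chunk_size=100000)
    preprocess_file('data_chunk_0.jsonl')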