You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
46 lines
1.6 KiB
46 lines
1.6 KiB
1 month ago
|
import json
|
||
|
from multiprocessing import Pool
|
||
|
import re
|
||
|
|
||
|
# Split a JSONL file into fixed-size chunks
|
||
|
def split_jsonl(input_path, output_path, chunk_size=100000):
    """Split a JSONL file into numbered chunk files.

    Chunk ``k`` (counting from 0) is written to ``f"{output_path}_{k}.jsonl"``.
    Lines are copied as text without being parsed as JSON. An empty input
    produces no chunk files.

    Args:
        input_path: Path of the JSONL file to split.
        output_path: Prefix of the chunk files; ``_<k>.jsonl`` is appended.
        chunk_size: Maximum number of lines per chunk file; must be >= 1.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )

    def _write_chunk(lines, index):
        # Explicit UTF-8 so the result does not depend on the platform locale.
        with open(f"{output_path}_{index}.jsonl", "w", encoding="utf-8") as outfile:
            outfile.writelines(lines)

    chunk = []
    chunk_index = 0
    with open(input_path, "r", encoding="utf-8") as infile:
        for line in infile:
            chunk.append(line)
            if len(chunk) >= chunk_size:
                _write_chunk(chunk, chunk_index)
                chunk_index += 1
                chunk = []

    if chunk:  # flush the final, partially filled chunk
        _write_chunk(chunk, chunk_index)
|
||
|
|
||
|
# Normalize a single question record
|
||
|
def process_question(json_line):
    """Reduce one question record to a minimal JSON string.

    Args:
        json_line: Either one line of JSONL text (``str``, ``bytes`` or
            ``bytearray``), which is decoded with ``json.loads``, or an
            already-decoded mapping that supports ``.get``.

    Returns:
        A JSON string with exactly the keys ``id``, ``question`` and
        ``answer``, taken from the record's ``id``, ``q_main`` and
        ``std_ans`` fields. Missing ``q_main``/``std_ans`` become ``''``, a
        missing ``id`` becomes ``null``. Non-ASCII characters are written
        unescaped.

    Raises:
        json.JSONDecodeError: If a text argument is not valid JSON.
    """
    # Raw text lines (e.g. straight from ``readlines()``) must be decoded
    # first; a plain string has no ``.get``.
    if isinstance(json_line, (str, bytes, bytearray)):
        data = json.loads(json_line)
    else:
        data = json_line

    # Extract the key fields (adjust to the actual JSON structure).
    question = data.get('q_main', '')
    answer = data.get('std_ans', '')

    # Text compaction (currently disabled):
    # question = re.sub(r'\s+', ' ', question).strip()
    # question = re.sub(r'(\d)\s+(\d)', r'\1\2', question)  # rejoin split digits
    # question = re.sub(r'(?<![\d.])\d{4,}(?![\d.])',
    #                   lambda m: f"{float(m.group()):.2e}", question)  # scientific notation

    # Reassemble into a minimal JSON structure.
    return json.dumps({
        'id': data.get('id'),
        'question': question,
        'answer': answer,
    }, ensure_ascii=False)
|
||
|
|
||
|
# Process a file's records in parallel
|
||
|
def preprocess_file(file_path):
    """Run ``process_question`` over every record of a JSONL file in parallel.

    The result is written next to the input: ``name.jsonl`` becomes
    ``name_processed.jsonl``. If ``file_path`` does not end in ``.jsonl``,
    ``_processed.jsonl`` is appended instead, so the input file is never
    overwritten. Output lines are joined with ``'\\n'`` and the file has no
    trailing newline.

    Args:
        file_path: Path (``str``) of the JSONL file to process.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    suffix = '.jsonl'
    # Only rewrite a trailing extension; ``str.replace`` would also touch
    # '.jsonl' inside directory names and leave other paths unchanged.
    if file_path.endswith(suffix):
        output_path = file_path[:-len(suffix)] + '_processed' + suffix
    else:
        output_path = file_path + '_processed' + suffix

    with open(file_path, 'r', encoding='utf-8') as f:
        # Decode here so the workers receive mappings; blank lines are skipped.
        records = [json.loads(line) for line in f if line.strip()]

    with Pool() as pool:
        processed = pool.map(process_question, records)

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(processed))
|