# QUE_REPLACE/components/spilit.py

import json
from multiprocessing import Pool
import re

# Split a JSONL file into fixed-size chunk files.
def split_jsonl(input_path, output_path, chunk_size=100000):
    """Split a line-oriented (JSONL) file into numbered chunk files.

    Lines are copied in order into files named ``{output_path}_{k}.jsonl``
    for k = 0, 1, 2, ...  Every file holds ``chunk_size`` lines except
    possibly the last one, which holds the remainder.  An empty input
    produces no output files.

    Args:
        input_path: Path of the file to split.
        output_path: Prefix of the chunk files; ``_{k}.jsonl`` is appended.
        chunk_size: Maximum number of lines per chunk file; must be >= 1.

    Raises:
        ValueError: If ``chunk_size`` is less than 1.
    """
    if chunk_size < 1:
        raise ValueError(
            f'chunk_size must be a positive integer, got {chunk_size!r}'
        )

    def write_chunk(lines, index):
        # Lines still carry their own terminators, so writelines is enough.
        with open(f'{output_path}_{index}.jsonl', 'w', encoding='utf-8') as outfile:
            outfile.writelines(lines)

    # UTF-8 is pinned explicitly: the platform default encoding is not
    # guaranteed to be able to decode non-ASCII JSONL content.
    with open(input_path, 'r', encoding='utf-8') as f:
        chunk = []
        chunk_index = 0
        for line in f:
            chunk.append(line)
            if len(chunk) >= chunk_size:
                write_chunk(chunk, chunk_index)
                chunk_index += 1
                chunk = []
        if chunk:  # flush the trailing partial chunk
            write_chunk(chunk, chunk_index)

# Normalize one question record into a minimal JSON string.
def process_question(json_line):
    """Reduce one question record to a minimal JSON string.

    Args:
        json_line: Either one raw JSONL line (``str``, ``bytes`` or
            ``bytearray``) or an already-decoded mapping.  A raw line must
            decode to a JSON object.

    Returns:
        A JSON string with the keys ``id``, ``question`` and ``answer``,
        taken from the record's ``id``, ``q_main`` and ``std_ans`` fields.
        Missing ``q_main``/``std_ans`` default to ``''`` and a missing
        ``id`` becomes ``null``.  Non-ASCII text is emitted unescaped.

    Raises:
        json.JSONDecodeError: If a raw line is not valid JSON.
    """
    # Raw lines (as read from a JSONL file) have to be decoded first;
    # calling .get() on the string itself would raise AttributeError.
    if isinstance(json_line, (str, bytes, bytearray)):
        data = json.loads(json_line)
    else:
        data = json_line

    # Extract the key fields (adjust the names to the actual JSON schema).
    question = data.get('q_main', '')
    answer = data.get('std_ans', '')

    # Optional text compaction, currently disabled:
    # question = re.sub(r'\s+', ' ', question).strip()
    # question = re.sub(r'(\d)\s+(\d)', r'\1\2', question)  # rejoin split digits
    # question = re.sub(r'(?<![\d.])\d{4,}(?![\d.])',
    #                  lambda m: f"{float(m.group()):.2e}", question)  # scientific notation

    # Re-assemble as a minimal JSON structure.
    return json.dumps({
        'id': data.get('id'),
        'question': question,
        'answer': answer
    }, ensure_ascii=False)

# Process every record of a JSONL file in parallel.
def _process_raw_line(line):
    """Decode one raw JSONL line and pass the record to process_question."""
    return process_question(json.loads(line))


def preprocess_file(file_path):
    """Run process_question over every record of a JSONL file in parallel.

    The result is written next to the input: ``name.jsonl`` becomes
    ``name_processed.jsonl``.  If ``file_path`` does not end in ``.jsonl``,
    ``_processed.jsonl`` is appended instead, so the input file is never
    overwritten.  Blank lines are skipped and each output record is
    newline-terminated, so processed files can be concatenated safely.

    On platforms that start worker processes with "spawn", call this from
    under an ``if __name__ == '__main__':`` guard.

    Args:
        file_path: Path (``str``) of the JSONL file to process.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    suffix = '.jsonl'
    # Only rewrite a trailing extension; str.replace() would also touch
    # directory names and returns the input path unchanged on no match.
    if file_path.endswith(suffix):
        output_path = file_path[:-len(suffix)] + '_processed' + suffix
    else:
        output_path = file_path + '_processed' + suffix

    with open(file_path, 'r', encoding='utf-8') as f:
        lines = [line for line in f if line.strip()]

    # The worker is a module-level function so it can be pickled for the pool.
    with Pool() as pool:
        processed = pool.map(_process_raw_line, lines)

    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(record + '\n' for record in processed)
init 1 month ago			`import json`
			`from multiprocessing import Pool`
			`import re`

			`# JSONL分割函数 [[7]][[8]]`
			`def split_jsonl(input_path,output_path, chunk_size=100000):`
			`with open(input_path, 'r') as f:`
			`chunk = []`
			`for i, line in enumerate(f):`
			`chunk.append(line)`
			`if (i+1) % chunk_size == 0:`
			`with open(output_path+f'_{i//chunk_size}.jsonl', 'w') as outfile:`
			`outfile.writelines(chunk)`
			`chunk = []`
			`if chunk: # 处理剩余数据`
			`with open(output_path+f'_{i//chunk_size}.jsonl', 'w') as outfile:`
			`outfile.writelines(chunk)`

			`# 标准化处理函数 [[8]][[9]]`
			`def process_question(json_line):`
			`data = json_line`
			`# 提取关键字段（根据实际JSON结构调整）`
			`question = data.get('q_main', '')`
			`answer = data.get('std_ans', '')`

			`# 文本压缩处理`
			`# question = re.sub(r'\s+', ' ', question).strip()`
			`# question = re.sub(r'(\d)\s+(\d)', r'\1\2', question) # 合并断裂数字`
			`# question = re.sub(r'(?<![\d.])\d{4,}(?![\d.])',`
			`# lambda m: f"{float(m.group()):.2e}", question) # 科学计数法`

			`# 重组为最小化JSON结构`
			`return json.dumps({`
			`'id': data.get('id'),`
			`'question': question,`
			`'answer': answer`
			`}, ensure_ascii=False)`

			`# 并行处理 [[7]]`
			`def preprocess_file(file_path):`
			`with open(file_path, 'r') as f:`
			`lines = f.readlines()`
			`with Pool() as pool:`
			`processed = pool.map(process_question, lines)`
			`with open(file_path.replace('.jsonl', '_processed.jsonl'), 'w') as f:`
			`f.write('\n'.join(processed))`