From 061fa4d3f1020c4ba0268c0135a1f73914987b16 Mon Sep 17 00:00:00 2001 From: jiangAB Date: Fri, 13 Jun 2025 17:09:49 +0800 Subject: [PATCH] init --- .DS_Store | Bin 0 -> 6148 bytes .gitignore | 2 + __pycache__/step4_major.cpython-313.pyc | Bin 0 -> 4529 bytes asd.py | 3 + components/.DS_Store | Bin 0 -> 6148 bytes components/__init__.py | 0 .../__pycache__/__init__.cpython-313.pyc | Bin 0 -> 167 bytes .../__pycache__/ai_check.cpython-313.pyc | Bin 0 -> 3424 bytes .../doubao_process.cpython-313.pyc | Bin 0 -> 1609 bytes .../jsonl_repair_reindex.cpython-313.pyc | Bin 0 -> 2735 bytes components/__pycache__/spilit.cpython-313.pyc | Bin 0 -> 2462 bytes components/__pycache__/step1.cpython-313.pyc | Bin 0 -> 4634 bytes components/ai_check.py | 66 +++++++++ components/doubao_process.py | 45 ++++++ components/jsonl_repair_reindex.py | 59 ++++++++ components/spilit.py | 46 ++++++ components/step1.py | 135 ++++++++++++++++++ jsonl_clear.py | 59 ++++++++ process.log | 0 process.sh | 65 +++++++++ readme.md | 25 ++++ replace.py | 30 ++++ replace_answer_detail.py | 64 +++++++++ requirements.txt | 1 + reset_id.py | 100 +++++++++++++ start.sh | 19 +++ step1_pre.py | 69 +++++++++ step2_ai1.py | 30 ++++ step3_ai2.py | 30 ++++ step4_major.py | 68 +++++++++ 30 files changed, 916 insertions(+) create mode 100644 .DS_Store create mode 100644 .gitignore create mode 100644 __pycache__/step4_major.cpython-313.pyc create mode 100644 asd.py create mode 100644 components/.DS_Store create mode 100644 components/__init__.py create mode 100644 components/__pycache__/__init__.cpython-313.pyc create mode 100644 components/__pycache__/ai_check.cpython-313.pyc create mode 100644 components/__pycache__/doubao_process.cpython-313.pyc create mode 100644 components/__pycache__/jsonl_repair_reindex.cpython-313.pyc create mode 100644 components/__pycache__/spilit.cpython-313.pyc create mode 100644 components/__pycache__/step1.cpython-313.pyc create mode 100644 components/ai_check.py create mode 100644 components/doubao_process.py create mode 100644 components/jsonl_repair_reindex.py create mode 100644 components/spilit.py create mode 100644 components/step1.py create mode 100644 jsonl_clear.py create mode 100644 process.log create mode 100644 process.sh create mode 100644 readme.md create mode 100644 replace.py create mode 100644 replace_answer_detail.py create mode 100644 requirements.txt create mode 100644 reset_id.py create mode 100644 start.sh create mode 100644 step1_pre.py create mode 100644 step2_ai1.py create mode 100644 step3_ai2.py create mode 100644 step4_major.py diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..f21236e459a609b12dc968ecea37be90703fd58a GIT binary patch literal 6148 zcmeH~O^*^W7{{No=njh-4w~%6WQ^Ah^0FZ@UJ%!VS221}GY-sT86Bo9Gl+(eaMllF zy!lD|E}r!Nv{e}x;!TO^L)xcrKic`Xolc2J^kzYms76E%0{ik2q5|W34qH|-C0F5v z`i~*`G^7!Qd}PWWAFanzuu!*4hOLupas7kJ|Wbr;QW0+374#+{N7%(x>gta#?Y1+`e<~vG*n(C-Q@OViI`EDPz^v?ISXUdc5sV zlUOF_sAskcwWiQ(2)$DL>J%e)o>xuWM9aKtn^a1p4TeXs&Q#D(h}jGkS=()~oDo+tsgt)b~BCbG%49tX@ zY7O$J3pmzf2iF4UqrYY$2VHh>ZNs$+IjEE4uKgP4DY!9ue2tl#7O|{Mj)hQrX=XJ$ z)x8{@9h6nTD)28A;P-=zz`kqzN_+L{K%}k!@MRP$Lz+Jpm~oBo8o$yWH87!3c#R5k z#Sj`txog^WjbCZ6aT4b8AxzK0+)#w-9slpD<|Mk>+t?~#704@aptChT{~zwY|K}l_ zvkF)R{woDU`K*7|!y}oqb?f2ytaTAyA+T|~mG%ljWR7E%;G_5$K^f{?PQZ7KUulmL QnEen?GT6u}@K+W10XlT=n*aa+ literal 0 HcmV?d00001 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0ac3693 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +work_data +venv \ No newline at end of file diff --git a/__pycache__/step4_major.cpython-313.pyc b/__pycache__/step4_major.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bb9da5fc4c8bbfc460f1abeb72d8a2b1c12c9f63 GIT binary patch literal 4529 zcma)9eNeiiP zX71d%bKjkrJHIAQJ@GNpW;3P4f(Bt=kEE(KJv`Sx@{_Bxw_t~j)ho??n zu&|=miJQDmEb!vICvlU%jd%N=B~2`*I2#%q&@1soy(C210_=cVYGHu_vrXa#Yb^{A ztSv65yQ#(H-a~4ux5L-$aX*VaEquMtT3!3JV`JshPud*Yimif=_deo~Y%210U^02B z&;cR507Vy?NX?3*7FI^s6g`Ai#J|)4a*ebJSj)TXJq<2*(_Kaql@CLnF+;H&h7W*?W}waBU~V|vzd{2l6WR*vXdtSo zM;TB|-=sj;b{>I~&_%06${`RnCKH|qm8e_S#Jl-DUc5oftM~ZbK1Zv|%?m~AT0Qm7 zR$)U?qMw|3Aq|R~utEJrC!B!=1FKIs|0A%#`)%3+_>{rVV0b#wO;!lDUYFYA3S=+GkwP({i=x03jZGj zEeQ9euV9AsYc%zVO#vqHA_)v0gL0Sp?yXcd$<5Tow+A6Rv42il2J0%p1Pzh7qb!a~gtcb_fo0lL9+L=0WJ3)z9gSiIG2{e<6$^J|HDdU$;P^Yt+gd z3W252;EgXPS(Kqj&(@rQOHq!B{a*d4_vxQ^pZ>J_^v|E|PpnEJyxFs7EmbH8APcw@ zWtEjMrHrjr*LKQPWqTb2hVFr9OUN)H`8@%&V0QJD{E?=?g29f{)_>%zzLGBk+e?`_)V-(D z3`3J90!NTwa4lsgcsnO4=T4y4(`#2Ew4O?wSeQ3l#wfzAq6ONdEC95n1)wDqXGF>= zQk~@fqiXfE^VmYmtH9PbL%>c#bZPP!MuQ{31WBZj=4Z5Dv5po59v&<>xAOGLGY>@; zm4p|Sj4X#bw5a5YHICz}iYfmZ7$fas@#N@mw68nb|HtW9--#XFA00k6^{2OQjrC1m zyc|2z6+3nI*4V!2E8Wox@5cfcZ;hQ0sg)uUDXU2B6se^mW%5^pu7ZNS@V_Z&3~~VSamXhipY#)qQ7n8d+TRnq zG(fDnd};dQ{nHoEPM_|BX0l4LoRx8q9lJo%Os(W1fVz7erM%cKLXVM$}8UGReV2gM8hR8)>p}j<|~oqCq++ za%>gVn0Ge79ZN0g!LkFBxM7mg6LMruN%mh{4&B82gyqbBrOQ z{gZ-EI<8p5Mtda39?G##3!Aq`%#M)R5q!QO*w`F4yCUY@A@lAa_6OTL!sgEJ5PEPU z{jF5qMyQP5L_@U&9xpS%tEA&bnOB@^e6taHik>3 z?ev6_hcd zI`Snc?VxHIB}M^1flZvcmQvy_xrSOawttg5aJ27ebogxx>nDxa>%FnpKaj~|g9;h# zc_ljBk2iyWG-RsxJiJ^iMoBLE*7?bgj>zPN{m~1@B@zx~EO1I9Z+>_<_Ltww0tZil zB0pJ?*x;yCM-QBoyABM^k|)6;i5q)&A5l&QM7w)s!R}r~$JFoMl~_dt5#V3E=6blCVcq%djJ=B*ei_ft++9 zCrBdWNT%R(VxL1n6B(JX4Fp#a;Z0C)MqQ+LS)^1UtV!EAZlV);huJg8I_YH8VO~DC;YvyQ5Z7`FGN3YUv$y4z=XYniOgybvqBSDLtKGX5p8HtXJ1{m5nP8)8Oi7 oLa7y9H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T02KczG$)vkyYXe`LuVi4maGb1Bo5i^hl0Gk>q#sB~S literal 0 HcmV?d00001 diff --git a/components/__pycache__/ai_check.cpython-313.pyc b/components/__pycache__/ai_check.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..07cd2679d64230e1fb52c6a8526cd532c9932b2c GIT binary patch literal 3424 zcmbVO>2niT5`Uu^Nu$g7kS|PZ#T-G#mJBQb8=S($jT4HsqAe;;l}bosTSOiyZ${zB zeh?(SVnAMku|wFP%>wHjfJu>r1ha(AKVZi!>l(aOTeTX={>0%_QAO>C?KdOIKwwGj zv!~yw?$@uof8DbdixEM)oN8)cW(T)=xb#4cE*7Y%jj|hNh=3h-N@8|;#dhe;Ug0gz z+YwF>10mi=rR9@O3m*;N?8`s~PzzACk-I(FDv=xRS_8{2jf<|+@RXZcgzU_Y1FW;k zu}28;!Ph~W0du|x76R{Zr#pUUB{| z6pVCmf^*ZZts9-;{bGBF--$zeIG^bJt$UNFdDG5U8#ion`a&Jy5YO?V;PeJOzIM*{ zmLt4hvbJ%eM+gUk0hO^Q7!X7Z7C;90gmNHJl$c+B+J4d=-!{Bwe9@X8HjdBVHpUcB zn##wR@>w{6eB?&5mlq5=_~ocUacA%ssM=~`vyV{bOmaq z?B*SjORzV6@htJ_#Fg`@lka_-*p|cXHQ9 zi7Vf$OLAAIeBrz5WPDT}jZL3Bn7nid_T-+f`n_v2C;R38FXU6h$rIi3`9V-sjt$Lp z#}ZfnEO(xrzTTZ0?yHtB4W!O|FL!>Mxb|UkU__3Ns@tk05UHL^%cyg#X;}5+pdx2& z&?|EL)N|?u<=ElmiBrix#@9|?KQ4d#32=qW2mIBKy)RrPdEsl|oIEi?Oi-=b;RwhDSpels(88RQTINd4uQos}2@ zHj)u@UcW~{Lx~Xs9h{UK#vvbITo!^cK?8{i@U0<<357Xc((c6pk+YjIQ9-hVB4RiK z2o?(ZIV|adp|&>YO}o7UU?>LY6D70Ab%Z@WFF>ECRx%2r7mFT690Qy>B(s9|X@`|8 zt&w0bD_=4JW@1hN1QkHpR-&U{`LSRI`L;(ICV5QC&uE~ir9d`b3>zfMg()CWbXS84 zw(6rzQw7DRjPbgOlA4JENA%T1Ug2QrvCo%%#$Sx$4hE^ zEK^05@rH?_7_Ai-A%TB%97V_ zAKEjXzjS!v75z_^^*=P;)genxY)kK!{-*e%aqE(s=9eDmQBET@ZdreC9?E;+=Mt1_ zjqU$nf8V>&t&@zgyXjz4zaw5U{^HAHw$&rTSnk@tGwU8$$Zu|0)<0C<u#qeZ;Gs5>FO^(LU=gw@GT+@=N0npPE1Hf7~4P4`^hp{fEJM%i#QvPzD?hnHx)}qqU8C z`mY8U-Ow8w3-veh^f3N~9mr*7KLIiTl?r+9=By=PG2MtoGAW1GJmU=g$Qk;%)_S^} zY`S!7+_X#Upk0t6>oia?u{$K)+nx?@fR`ZHAA>$g6Y#?`*TQ8bHdC@;Crmd37MVF- zh+xj+6?}n!ot9`QcQDCklGY#T2n%*bG7z&t*~4=Z>8)D;?6Y~~Blvwg&yLxx^hvIm`QWf3&b3QWX#_B() zpUlaRwfDC7i+%j0b-`q5#XUV!th>vi!uj_MjJfPCVzgzLfQm2c@-WHPRA0g5nE?aF zfqK7a*QW7MwPo zG{%23!i|@#>)AGGwkQa=EzXQv7u_;1zNbZn<&VVHLxB{GhgUYT^bOY7Sfq#4LyV$2 zlb+P+!N6`z0ty!bkxJ(ZZihb<+3gLHL`1ThlFXD6TS*Bj2q>OK?)@23tgogKc3nrv z9|>}=;PtRYVpAYSxl2n1jJ52GMW+cYsTX)k+BRHg=_O_fGl`b1dw2kbg+R!!nsKIfi~ zd(OGPbL>^Gw;IsZ^~bTvCJunV9HFw<_04LQZf*hyAae}hHO^oLy08nnu^X~20JCuq z^mGmbpaube9OmL|D~NMZW-SKhiUIXG!qCh3%A6;FjU zT^d7)vk847bk0a>h=epFC97saJ{OU-N!606Y$lPFFm$4=mt3xvLIT|MU5!&Mg@73? zs~zGL2`K;}m(D;IdWPt=LoS(tJQQFRtPXi}*Ql%FK3D;>qM*DLj5_hlYq%cP9CYjM zA@`tA(QR;8TZXm|0BWz<&{uZR^^MLBv4h@^dNt7Jp{7^*iS3}`^-6mk z+*Gme0;6Ij1FU~_-97*tsHLvrx4|iT>_+1!K(O&dhChN-jBp@Rr)k zgD%_Kowg|IkL^hr<0xqhDMW}mfrwq5QZFc$F@l$ioq@Ci(cZwyjoZ&>XP-U#V(Id1XCPQ*^WCwxlw-Xkwt&$)X+$iV z(KLu<3rD(PuQD`v+^!*(YNd&y(fYB^N#>qFmhCZdLbq(zLKiH0J)wM>;)$K{u-S3d zQk|g0q$7`Pj_%S)i#UT3aF*R8zilMb^b&IKaDR_%UZm*jF>IVgnk5em^(uqCvE$uG zd&{4Wq0?3nd94cy?Er;=gv^U}txBep|568Nbmw?w#nzl`K#RDyW{J`=q8y0)1E^Lx4qd%E&%hx3PH`GJx7fm4NnQ~8*Z z_nm$$s70}1m0knbf}*6Cy7cdoLb- zo_0R&m0R~#zRc04JHq0pcb31sQEs-&Ah?mLaDziOJ0w7flR#HfNKvrg5jj;S=Ml!u zRQLwq9hBkiWxqpohSN#(27aB2&TovIrljO%80IOcdJ4pn0BXWnzF51@iymn&v|!d`~Kv;PXzCWLT+oWI`7;0SlG2DZq5mL-;Upf){+MZ Mz7h}EmU0FE0;VsdFaQ7m literal 0 HcmV?d00001 diff --git a/components/__pycache__/jsonl_repair_reindex.cpython-313.pyc b/components/__pycache__/jsonl_repair_reindex.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7875e18c302d2a14aa5e5c5a0e2da944d2aab732 GIT binary patch literal 2735 zcmbVNZ)_9i8Gp}rXW#kkI3Z3AA>d00E3p<5#-QZSBBiwhBP69hyj4mzbB!;-ket2u z&MQRwkWFlas)(>IOF*=w8?C&GF3~2n**dXqqf$TG1QvRv)zIP)J}qmIsMwc1?>T3u z)%dd4^7}sT^Y4A`z2EP-Rkz!TU?dhhk2yJn{z*30aI6Wd4294*l8{7oBAj0-OqqKd zw!v!aqz^Kf5s^e!BFst*%=TV~#5TW(ko*!ty|!?vmA1InAjvK{nkdZo^5Kf2-jXD) zSCDwgDG5!G`w2cS)5l%#*np>7WXLWeKN_(0qi@lE)KAsXsqNsl^y`tOD{tlAow$GP?Zw&OElp16#@>c`>GIj7 z@zML2P9Kz>?>^E|M;i>O+F%D0a!fNkL>*N65-}x&aRS4`al{_gbgYreMB`w?cM^lC zAw_Ely?7wlkUXUyOQ?r&;<(bMH@pxDMZ=-PhuWVDHS{Hhk_lB&b*+J<7>usn5?11B zOgXtTdCG9dwdfD!!FbHPI!w@KhrhNHrV+HlFl@yF=Q{6v#W}X`7FTu0Q~t$*;F%Zv zS;4=GD7NAgk9UQJ^|9uHke@uY30?Fy($iEU-M9XJiP3xyAA)v#{jIL|0w*%UY<^uo z7vIZ{HFc~6p&pAAK!yk^LA!0%+f%3>-inU^B_vv6nrItRZR_-u#1`~)k5KTpnMAOa zqXTJ#Qcz(s6k)?|%U)b9D64FQEz|}6T>#!5kL4F(t*R^^*$4Il1;ZOG&*BP5EJUgH z=UV}}1xl{7I3kWh&Zob0rsMxQvk(Pm+PZ#>P=pR|E>>kp>uUcWK0;na1un4?0%WuBgEG|tIVBVj+2;JYAC7I@rE<5CR2K}AC57EL`pAg z>~YmpIzXAX4#rg_3a7rLV1pqWgNezyY;gIuq{IwL(FkcpYhbc7+)141Q#8$NQG*y@ z4ort0RSVZSf))h#`De~PbH43t+hkx~*q#-(FO>K*HBGaD*{Y1#F;}ub!|i{_A!o_x z@yTsz$4$O*relG3kDi#^o%Y@2cg=ivu{x06J6m?Ex+Uw{GTM2k+&9sDq4~Ae(cow0 zzE4UwKD49KEt5y)JvCWR&4O?9#PH9Dr)uYXPhVhG>|EKF1%Dv@%{hPL*n!V{{;ORx zJLYR!vb8M>{%u!=FAb-6%=vd;WbSPgSG>sWJ>PY<>vHXtz@@;{ky$od+4i9~TN(Ph ztK;9hQBCV37CFE1h(^v`8NTt0hhIY*t5=Z?j__`p3{GvHa%bv0KMsDpH}gVo=6m1I z^hPt4M{jxMPrX(DSz*EN@uTgK;5NVOv6r}|N7IbOYeV>du3|*%saVXy&--(2%6j<-pgbA!Rv1T z&dmTovJg|WK7b&>*|4L&-h|YV6zME^iUAeAlzjR9>D=|x}D`FTyiWHgAx5w5WHc%V+e=vqw$6{12gQUZ~0nRdd4D gk>EY1Y^-)7a3L@myK>^u{s7X-b?FJ+cmSr!n#9nV^ZRMd> z;#5J~2-B(!r^>yNTaLLkA(x&i5Js^kRE^Y1Rc=X%io~h^ti3i$^SJa!%=|O+zh>r} zZ@pb#??NEL@8@Q82SR_)fh}xB+5Qle?~#mT<{ZMbXU{R8vY72G&vBTO9WvL7W*N-O zd@I6&EI{j+X2RlA8e}ZY5dLB&lS*)vn6z`kKOJxX%Fy_e(3VAkOaqndEYi7tr1O(i zPlX$(ST>?j1)&^R8kZG5D%k$DUUAsAok$U6wilKZwik9PbH^tTN`mi&9<(<%Bl8o{ zUYKt{vhV>%J!Hq8YD7hhinR>3rtq~)a6<5B`*b2&|Aldc+z6`R4^>>^@4cLNT!7mC z*UT?hGL5>>yU4B}3FkNkx7Y!7b)vD)PH`P*ig|;@p|@E~qq0R<93u>V3h@z!4I#qB zb{&1QW=2mD=31C1e8y09BB^>Vt>SoAC62fOGfCpQhPAAk(sb1fJBdrvja)Wn#IrNR zoyldZ16N`ur(cPg+D(-(Q$(UPCCT(PNRnMCJrlF;@Y@cz1xJ?7Jn6(Mq3~>QP z$XthJ4sA*G^Rvqbic))_tK{|13tP>d1>X;!FMKjTSgP|ao?ke>94OYcm%P4w*Akb% zx$fyIT-x-6wu0|0A6a2my6*I@blh{Sh0d(Ep1I#x3=aR&Tne;33WU}Jp+c@0=-uH^ z(+KmVrS)6kxf`|g{8NYOd|SR?{z_r65MOEiDP4%&9a-x>vEFpzo?P?|-XAY{n;&_* z*S+0^fuc7I0sKSEP9t)4tV!)VE*fq#IJ~>#r1P7;!Ch-@Q)=HeKZaf1>WegSKL??C z5bzHj6(59-fWGSUM+9N@fRpM+T@lv3dRheiq2P)%N)J7j-sFn(hzWZ8w$QV(-ujQA ze2)~QGYS*kYf-V38JV?Tj?TWuhTCJqO;Uha66I~J)_0@V#HhYkg;h9N=%XAae#G{m zuyc$!uEx@FO(%$mW;Pj%>n35fWSGPIATn`Y&8cQq%jkew-Mpsa5#YBvRX1~3jm6D` zriD4eO{-bLQ?w8vnM)fcpkAWGn1xV28PCS?eh7!#p)qOc6sNd}X7Cy~lTb}FW@o|& zVLCt!f$;!vC3j%%>=RGp;`N2=%g#dghUehiXsO<}II%F1*9!iP`kuLw7k*UdT|EBP z@lu0-ab{sApZ!`d1@@KvEzd>1+40PYg00UbzOL;V;)S+t2#j8RX?eJ)HxM+v$|^n6 zC}kO_^g8=84NX?5+P0xIuW+v-)=Bz=Eg7!?ga1#&DwGryb=w}bUS?&^hLz0Q8f2xl z*VpW9O%|daTd(!wRnWYi?B7VSU%ZV#$JDo3R4LXmhUqb4dOkP3u?k|;WSM5M1k;3Z zg`Jpw;faLRc+#R(Os|S?>9}EmiilV>Qt^aJ1dDnBvl&ef3q+y>YEdutQ4hiLCcFXP zaUxaInY0Ls+zNC(2E?nNmqx(v!#e%qmaS`7BwiS|0@K zJgxoSK{y2V<;Uzina(70DfKj_Kcpy7P5SPiaSX#eM(*EG=VK(jaJZSnfA8}#ZQF+& IOvrNo7f9ynbpQYW literal 0 HcmV?d00001 diff --git a/components/__pycache__/step1.cpython-313.pyc b/components/__pycache__/step1.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6151aa37b3174aa081c7878eadf9a26134eebb96 GIT binary patch literal 4634 zcma)AZBQG>8Q%MnP9JarNq|5W!eFp$12zVK5XUJF5H|@V;gFk#WHc9afE;mB?xZ-7 zOox1vOq$S+-H?x_bmAtssasDv(>iUM+DT^mqkp78!u7%rPh)WWE7*7@iTk5{cMnN8 z@npJ;-hFrP-FI>o67KFXCx>2XnLG7WT=&Z}oKw&)kd@TBHig|@NPYQ2suy?M+g4e*4{0M287%gkqfVCw85ueAKE@&szu$p&OZMG(^>)Z78?a zC`4q44x(!C7DT=GnHC{iuZkCAQ0i4Kof>M~6QT@WRt6K%m@<43#-MS3m#QYzU{p3c zgSU!z^ac}3FdkFn;5gkL953%rq@+0M(1J-dF(_-A9BmJVc!NzOWyN3*YYD~R;t46L z8Jw0dTIRU<1=+{ZTa<JvK-jYywqXuxQRVmgY~mF?KyCR^@H6E=pVFGSigbj`|US7t%GjLF$T1SKz=1=Ii@K}wh^?D}fRM1|_0hH0}NqH_B_#P+5w zV88U{<4do;-`yXg4O`d4gK`q@7Zs;0^f>dI1@@ys$r{dsU{EcGO2w&4#70oaq9@P_ zZRNbW!#(xroTFBESDbT~f4%7NtRkB8F7ic_()0YH;Q7xA2komG9JlCkiyb{@4>3m_ zbSw$Hp=xwKI1uSR1m| z23>IL4%aG}qrv<~g+?*_$hPQ*uK-{KZ~?HvZYMwyp+r_>KypM{h_n*P6KNA|@YGJE zgGeWlMMM@8DTo5txQHwvvXn?Sk?>UXG|^p@!i5pQQ3SS~1>ndcdU5E&9ND4(6n7so zdq^)lZGfkD&^?t1iDhDWn1T@%_?hSnGoV+l*;TF8{@l2=dc#^Bz_nQ2-QGg{yVeH95ZkOoTw-N&2n> zUjW0jxKXU#O9PPG!nVFV9GZTy4Cnn_2R;jLSXb~ljNlN+gFYMr5O;+q>CSiXJsF4; zv^)4h!ha9YwX6Z%rd}00GtY%rPhEWF{WK0}zJ7b!U`JxZBL>x$#=hH@r+)=Kk{Ts; z=TDzBETeKXmL4^zR)g9}j9S5nC_Ax&=R5zm7a*GwM+|BQd9VXdL$!ox(rO`-;uJFM z(b&*XY%m>9jiqs9FTOo};pDT+$6o&8@psIZJpR_w$u|wFmcAcDkU@`+7<>-&04hle z8a7Fh;$!0g)`swak~k8Vqr-9}nE<#nSd5Gzr{Ne@r6>SlT+#r2NyFeqrNar;u#zg$ zV(@VQy|gqe8+0scSRRawN-+hTr=mF7jUv-sBr2z*SlqDVxJ2UeLvq~UACSjPe=rJr z9IrI#0D<*2wm_DkwFlGqxL7{<#?ouEOHaSN^we*bjz4)}>glDa7cRW=}VGN>a!N!U@~0%^Rv)`tE5`Q^!}FW-9%D29D7E=vlb5-Lzf zq(U+xa>}dKpcytyJ}~xcDo@%+)L2T6Nbz{cu9jlkcHHeqq*KXs%48S3mXg#|1VZ7? z^BI$@5N1(a_;?Vgq)AI)(ov(hV2<0WA5U!|9#swJQZQc#(ir8#vLdU%ejS);*@~#d?`JwjApfrtZUfr8!R( z*8Qfw0qX&CY&F({rhN_8H=6ymSg%7RWxC6Ixd9pzy{k>gyK$0Vq5U@RIiY;onH4t8 z2Ihp_dT937V3Y2v(R~5Es&?jw*{UYpAJoeNdgaEM z_H1Pn?md{TYSe2&vs<$@yYxW)%%j;rtM0GMY}%Fe@5Y9^vXxsd`HBL?lf730D6o09 zcy7b)$zI)Az2L0RI_qbCGVk1_J3Z6XvpvrpywnEocx69fTJ^`Z%+~#L4L4=#e~__v z%(Dj;SzE@wcb;uuw3WUXcrGyQn61TRuU_Yi7x;=SUy<=`o97z|UptSDt3F}7cVhS%<5ot9SX7O8gRON`YD{= zCtteoyEo6Deg{rb$n%T8e0}+k$IMO@zoTJQTr}oYEjakjdL}X94b3e+=DY@Y7mAV0Kb6RreaRVD)FM{&Rx+ z#YdifWP11P-pNNYLfe9{FDvXj{i8XdE5mo`4q@UpwGM_|!`SfYLBiP$9{=xf0%N{@ zICFPZH#$TeqBUkakS(AdB>+-Df|8|SssPF1{Jkp%tp`8s<2F-;tE> zd{#J#Fts_vXmqpb2}l>*v-1!fW^xdGC*1?|mH40)=35@}gy1_K0Hz1<0Za<;QT$z? zm>WW)b8kEZ-a&_%hfs*>4O!KGu!4(O3{Z~4r>3TMlOBB5s$o#cw+yjr8SZ8Pa10L6 zB@MSZVvg||AFC#J$79I^6QF2MfS5pw*j>&3|~#Q&^N0N 5: + return False + # 使用正则表达式判断是否包含中文 + import re + if re.search(r'[\u4e00-\u9fa5]', s): + return True + return False + + +def generate_processed_data(ori_data_list,major,output_path): + start_id = 1 + processed_data_list = [] + difficulty_map = { + '困难': '难', + '高': '难', + 'high': '难', + '5': '难', + '5星': '难', + 'hard': '难', + '中': '中', + 'medium': '中', + '4': '中', + '3': '中', + '4星': '中', + '3星': '中', + '低': '低', + 'low': '低', + '1': '低', + '1星': '低', + '2': '低', + '2星': '低', + 'easy': '低', + } + + #major和major_2的映射关系 + for ori_item in ori_data_list: + try: + # 处理难度 + if ori_item['difficulty'] in difficulty_map: + ori_item['difficulty'] = difficulty_map[ori_item['difficulty']] + else: + # 如果不匹配任何规则,可以选择跳过或记录日志 + continue + # 处理数据 + processed_item = { + "grade_class": "高等教育", + "grade": "大学", + "major": major, + "major_2": ori_item['subject'], + "language": "zh", + "id": start_id, + "q_main": ori_item['question'], + "std_ans": ori_item['answer'], + "answer_detail": ori_item['analyzing'], + "hard_level": ori_item['difficulty'], + "keypoint": ori_item['knowledge_point'], + "q_type": ori_item['type'] + } + #清理html标签、不可见字符、异常字符 + processed_item = clean_data(processed_item) + processed_data_list.append(processed_item) + start_id += 1 + except Exception as e: + # logger.warning(f"KeyError: {e} in item: {ori_item}") + continue + # 将列表保存为 .jsonl 文件,这一步是最早的数据过滤和格式整合结果 + print(f"Total valid JSON objects: {len(processed_data_list)}") + print("正在写入处理后的文件,请稍等...") + with jsonlines.open(output_path, mode='w') as writer: + writer.write_all(processed_data_list) + print("写入完成!") + + + +def clean_text(text): + """清理字符串中的非法字符""" + if isinstance(text, str): + # 替换孤立的代理字符 + return text.encode('utf-8', errors='replace').decode('utf-8') + return text + +def clean_data(data): + if isinstance(data, dict): + return {key: clean_data(value) for key, value in data.items()} + elif isinstance(data, list): + return [clean_data(item) for item in data] + elif isinstance(data, str): + return clean_text(data) + return data \ No newline at end of file diff --git a/jsonl_clear.py b/jsonl_clear.py new file mode 100644 index 0000000..c3af7a5 --- /dev/null +++ b/jsonl_clear.py @@ -0,0 +1,59 @@ +import json +import os +import argparse +from json_repair import repair_json + +def is_valid_json(json_str): + """检查字符串是否是有效的JSON""" + try: + json.loads(json_str) + return True + except json.JSONDecodeError: + return False + +def process_jsonl(input_file, output_file): + if not os.path.exists(input_file): + raise FileNotFoundError(f"输入文件不存在: {input_file}") + + id_counter = 1 # 自增ID计数器 + + with open(input_file, 'r', encoding='utf-8') as infile, \ + open(output_file, 'w', encoding='utf-8') as outfile: + + for line_number, line in enumerate(infile, start=1): + line = line.strip() + if not line: + continue + + # 如果能解析就处理 + if is_valid_json(line): + data = json.loads(line) + else: + # 尝试修复 + try: + repaired = repair_json(line) + if repaired and is_valid_json(repaired): + data = json.loads(repaired) + else: + continue + except Exception as e: + print(f"第 {line_number} 行: 无法修复的JSON行: {line} | 错误: {e}") + continue + + # 添加自增 id 字段 + data['id'] = id_counter + id_counter += 1 + + # 写入文件,保留中文等非 ASCII 字符 + outfile.write(json.dumps(data, ensure_ascii=False) + '\n') + + print(f"处理完成,共写入 {id_counter - 1} 条数据。结果保存到:{output_file}") + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="处理 JSONL 文件,添加自增 ID,并修复非法 JSON") + parser.add_argument("--input", type=str, required=True, help="输入的 JSONL 文件路径") + parser.add_argument("--output", type=str, required=True, help="输出的 JSONL 文件路径") + + args = parser.parse_args() + + process_jsonl(args.input, args.output) \ No newline at end of file diff --git a/process.log b/process.log new file mode 100644 index 0000000..e69de29 diff --git a/process.sh b/process.sh new file mode 100644 index 0000000..c5cabec --- /dev/null +++ b/process.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +folderPath=$1 +fileName=$2 +major=$3 +split_size=$4 + +echo "正在进行文件预处理,请稍后" +python3 step1_pre.py "$folderPath/$fileName" "$major" "$split_size" +echo "文件预处理完成,请查看spilited_ai1文件夹中内容" + +echo "现在开始处理AI答案正确检测...第一遍" +i=0 +for file in $(ls "$folderPath"/spilited_ai1 | sort -V) +do + echo "启动任务,文件路径是:$folderPath/spilited_ai1/$file" + nohup python3 step2_ai1.py "$folderPath" "$folderPath/spilited_ai1/$file" "$i" > /dev/null 2>&1 & + i=$((i+1)) +done + +# 检测AI1任务是否完成 +while true; do + file_count=$(ls -1 "$folderPath/ai_1" | grep -v '^d' | wc -l) + echo "AI1当前文件数量为" + echo "$file_count" + if [ "$file_count" -ge "$i" ]; then + echo "AI1任务全部完成,继续执行后续操作..." + break + else + echo "未达到" + echo "$i" + echo "个,等待10秒后继续..." + sleep 10 + fi +done + +echo "现在开始处理AI答案正确检测...第二遍" +j=0 +for file in $(ls "$folderPath"/ai_1 | sort -V) +do + echo "启动任务,文件路径是:$folderPath/ai_1/$file" + nohup python3 step3_ai2.py "$folderPath" "$folderPath/ai_1/$file" "$j" > /dev/null 2>&1 & + j=$((j+1)) +done + +# 检测AI2任务是否完成 +while true; do + file_count=$(ls -1 "$folderPath/ai_2" | grep -v '^d' | wc -l) + echo "AI2当前文件数量为" + echo "$file_count" + if [ "$file_count" -ge "$i" ]; then + echo "AI2任务全部完成,继续执行后续操作..." + break + else + echo "未达到" + echo "$i" + echo "个,等待10秒后继续..." + sleep 10 + fi +done + +echo "现在开始合并最终结果" +cat "$folderPath"/ai_2/* > "$folderPath"/ai_2_total/final_output.jsonl + +echo "处理完成,请注意合并数据时课程与后续id重设等操作!" \ No newline at end of file diff --git a/readme.md b/readme.md new file mode 100644 index 0000000..59331b8 --- /dev/null +++ b/readme.md @@ -0,0 +1,25 @@ +# 用于批量处理AI题目的检测功能 +会自动处理原始数据、进行数据过滤、第一次ai检测、第二次ai检测,最终合并在ai_2_total中。 +分割文件大小自主控制一下,使一个文件分割成100个左右的文件来并行执行 + + +# 使用方法 +1. 在服务器上创建一个目录,例如./test_01,也可以用绝对路径,将原始jsonl文件放在路径中,例如./test_01/test.jsonl +2. 执行 sh start.sh ,按提示输入参数,例如: +请将原始文件放在一个单独目录,请输入文件夹路径: +./test_01 +请输入文件名: +test.jsonl +请输入处理的科目名称,例如:物理: +测试 +请输入AI并行提问分割文件的大小,例如:10000: +10 + +3. 运行成功后,会在./test_01下创建多个目录 +transformed: 数据初步有有效性筛选后保存在这里 +spilited_ai1:分割后的文件,用于第一次AI检测 +ai_1:第一次AI检测后的存储目录 +ai_2:第二次AI检测后的存储目录 +ai_2_total:最终两次检测后合并在一起的数据 + +启动后会在process.log中可以查看日志情况。 \ No newline at end of file diff --git a/replace.py b/replace.py new file mode 100644 index 0000000..c7782e7 --- /dev/null +++ b/replace.py @@ -0,0 +1,30 @@ + + +import logging +import sys + +from components.ai_check import process_jsonl_file + +# 调用方法 +# python3 step1_ai1.py parent_path output_path file_path file_index + +parent_path = sys.argv[1] +file_path = sys.argv[2] +file_index = sys.argv[3] + + +# 配置日志记录器 +logging.basicConfig( + level=logging.DEBUG, # 设置日志级别为 DEBUG(最低级别) + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', # 日志格式 + filename=parent_path+"/log/ai1_log_"+str(file_index)+'.log', # 将日志写入文件 + filemode='a' # 写入模式('w' 表示覆盖,'a' 表示追加) +) +# 创建日志记录器 +logger = logging.getLogger(__name__) + +if __name__ == '__main__': + # # 读取数据文件 + print("Start") + temp_filepath = file_path + process_jsonl_file(temp_filepath,parent_path+"/ai_1",logger=logger) \ No newline at end of file diff --git a/replace_answer_detail.py b/replace_answer_detail.py new file mode 100644 index 0000000..dc68405 --- /dev/null +++ b/replace_answer_detail.py @@ -0,0 +1,64 @@ +import json +import sys + +def load_b_file(filepath): + """读取文件 B,并返回以 id 为 key 的字典""" + b_data = {} + with open(filepath, 'r', encoding='utf-8') as f: + for line in f: + line = line.strip() + if not line: + continue + try: + data = json.loads(line) + # 只保留需要更新的字段 + if 'id' in data and 'answer_detail' in data: + b_data[data['id']] = data['answer_detail'] + else: + print(f"警告:文件 B 中缺少必要字段: {line}") + except json.JSONDecodeError as e: + print(f"解析失败(文件 B):{e} -> {line[:50]}...") + return b_data + +def update_a_file(a_filepath, b_dict, output_filepath): + """读取文件 A,根据 b_dict 替换 answer_detail,并写入输出文件""" + with open(a_filepath, 'r', encoding='utf-8') as fin, \ + open(output_filepath, 'w', encoding='utf-8') as fout: + + for line_num, line in enumerate(fin, start=1): + line = line.strip() + if not line: + continue + try: + data = json.loads(line) + record_id = data.get('id') + + # 如果在 B 中有对应 id,则替换 answer_detail + if record_id in b_dict: + data['answer_detail'] = b_dict[record_id] + + # 写回文件 + fout.write(json.dumps(data, ensure_ascii=False) + '\n') + + except json.JSONDecodeError as e: + print(f"第 {line_num} 行解析失败:{e} -> {line[:50]}...") + +def main(): + if len(sys.argv) != 4: + print("用法: python replace_answer_detail.py <文件A路径> <文件B路径> <输出文件路径>") + sys.exit(1) + + a_file = sys.argv[1] + b_file = sys.argv[2] + output_file = sys.argv[3] + + print("正在加载文件 B ...") + b_dict = load_b_file(b_file) + + print(f"共加载 {len(b_dict)} 条记录。开始处理文件 A ...") + update_a_file(a_file, b_dict, output_file) + + print("处理完成!结果已保存到:", output_file) + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..ff61b58 --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +wheel @ file:///opt/homebrew/Cellar/python%403.13/3.13.2/libexec/wheel-0.45.1-py3-none-any.whl#sha256=b9235939e2096903717cb6bfc132267f8a7e46deb2ec3ef9c5e234ea301795d0 diff --git a/reset_id.py b/reset_id.py new file mode 100644 index 0000000..55c9a83 --- /dev/null +++ b/reset_id.py @@ -0,0 +1,100 @@ +# 用于交付前重制id列表 +# 同时处理每条数据,删除问题内容前的序号 + +import argparse +import jsonlines +from tqdm import tqdm +import re + +html_tag_pattern = re.compile(r'<[^>]+>') # 用于检测 HTML 标签 +unicode_pattern = r'\\u[0-9a-fA-F]{4,}' +# 输入文件路径和输出文件路径 +def parse_args(): + parser = argparse.ArgumentParser(description="Find high Jaccard similarity entries in JSONL file") + parser.add_argument("--input", required=True, help="Input JSONL file path") + parser.add_argument("--output", required=True, help="Input JSONL file path") + parser.add_argument("--major", required=True, help="Input JSONL file path") + parser.add_argument("--start_id", type=int, default=1, help="Start ID (default: 0)") + return parser.parse_args() + + +# 获取输入文件的总行数(用于进度条) +def count_lines(file_path): + with open(file_path, "r", encoding="utf-8") as f: + return sum(1 for _ in f) +def process_file(input_file, output_file,start_id,major): + total_lines = count_lines(input_file) + # 打开输入文件进行逐行读取,打开输出文件进行逐行写入 + with jsonlines.open(input_file, mode="r") as reader, jsonlines.open(output_file, mode="w") as writer: + new_id = start_id # 初始化新的 ID 从 1 开始 + + # 使用 tqdm 包裹 reader,显示进度条 [[10]] + for line in tqdm(reader, total=total_lines, desc="Processing", unit="line"): + q_main = line.get("q_main", "") + answer_detail = line.get("answer_detail", None) + std_ans = line.get("std_ans", None) + keypoint = line.get("keypoint", None) + major_2 = line.get("major_2", None) + # 跳过条件: + # 1. q_main 以数字开头 + # 2. answer_detail 不存在 或 不是字符串 + # 3. std_ans 不存在 或 不是字符串 + # if ( + # re.match(r'^\s*\d(?!\d)', q_main) or + # not isinstance(answer_detail, str) or + # not isinstance(std_ans, str) or + # not isinstance(keypoint, str) or + # not isinstance(major_2, str) or + # html_tag_pattern.search(q_main) or + # html_tag_pattern.search(answer_detail) or + # html_tag_pattern.search(std_ans) or + # re.search(unicode_pattern, major_2) + # ): + # continue + # 修改当前行的 id 字段 + line["id"] = new_id + # line["grade"] = "研究生" + # line["major"] = major + # line["q_main"] = full_to_half(remove_prefix(line["q_main"])) + # line["answer_detail"] = full_to_half(line["answer_detail"]) + # line["std_ans"] = full_to_half(line["std_ans"]) + # 写入修改后的行到输出文件 + writer.write(line) + # 更新 ID + new_id += 1 + +def remove_prefix(question): + """ + 移除 question 字段开头的序号,例如: + - "1.", "2.", "1. ", "2. " + - "1.", "2." + - "1、" + - "1)" + - "2 题目:" + - "1题:"、"2题:" + - 处理类似 "2\\.", "3\\." 这种形式 + """ + # 正则表达式匹配各种可能的前缀 + pattern = r'^\s*\d+[\..\\))、]|\d+\s*题目:|\d+题:' + result = re.sub(pattern, '', question).lstrip() + return result +def full_to_half(text): + """将文本中的全角字符转换为半角字符""" + res = '' + for char in text: + code = ord(char) + if code == 12288: # 全角空格 + res += chr(32) + elif 65281 <= code <= 65374: # 全角标点符号 + res += chr(code - 65248) + else: + res += char + return res + + +if __name__ == "__main__": + args = parse_args() + parser = argparse.ArgumentParser(description="JSONL格式验证工具") + process_file(args.input, args.output,args.start_id,args.major) + print("ID 重置完成,已保存到新文件:", args.output) + \ No newline at end of file diff --git a/start.sh b/start.sh new file mode 100644 index 0000000..b19f0fe --- /dev/null +++ b/start.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +folderPath=$1 +fileName=$2 +major=$3 +split_size=$4 + +echo "正在进行文件预处理,请稍后" +python3 step1_pre.py "$folderPath/$fileName" "$major" "$split_size" +echo "文件预处理完成,请查看spilited_ai1文件夹中内容" + +echo "现在开始处理AI答案正确检测...第一遍" +i=0 +for file in $(ls "$folderPath"/spilited_ai1 | sort -V) +do + echo "启动任务,文件路径是:$folderPath/spilited_ai1/$file" + nohup python3 replace.py "$folderPath" "$folderPath/spilited_ai1/$file" "$i" > /dev/null 2>&1 & + i=$((i+1)) +done \ No newline at end of file diff --git a/step1_pre.py b/step1_pre.py new file mode 100644 index 0000000..942b296 --- /dev/null +++ b/step1_pre.py @@ -0,0 +1,69 @@ +import logging +import os +from pathlib import Path +import sys +from components.ai_check import get_spilit_file_list, process_jsonl_file +from components.spilit import split_jsonl +from components.step1 import generate_processed_data, load_ori_file, ori_data_validate +from step4_major import start_process_major2 + +#输入信息 +#完整文件路径 +start_file_path = sys.argv[1] +major = sys.argv[2] +split_size = sys.argv[3] + +# 获取文件名,文件路径 +p = Path(start_file_path) +ori_file_name = str(p.name) +ori_file_path = str(p.parent) +ori_file_pure_name = str(p.stem) +question_type_list = ['填空题', '解答题'] + +# 初始化文件夹路径 +os.makedirs(ori_file_path+"/transformed", exist_ok=True) +os.makedirs(ori_file_path+"/major2_processed", exist_ok=True) +os.makedirs(ori_file_path+"/spilited_ai1", exist_ok=True) +os.makedirs(ori_file_path+"/ai_1", exist_ok=True) +os.makedirs(ori_file_path+"/ai_2", exist_ok=True) +os.makedirs(ori_file_path+"/ai_2_total", exist_ok=True) +os.makedirs(ori_file_path+"/log", exist_ok=True) + +# 配置日志记录器 +logging.basicConfig( + level=logging.DEBUG, # 设置日志级别为 DEBUG(最低级别) + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', # 日志格式 + filename=ori_file_path+"/log"+ori_file_pure_name+'.log', # 将日志写入文件 + filemode='a' # 写入模式('w' 表示覆盖,'a' 表示追加) +) + +# 创建日志记录器 +logger = logging.getLogger(__name__) + +def process_wrapper(args): + ai1_file_path, output_dir, logger = args + process_jsonl_file(ai1_file_path, output_dir, logger) + +if __name__ == '__main__': + # 非json数据过滤 + # 题型过滤 + # 【如图】关键字过滤 + # 简单回答过滤 + # filtered_data_list = load_ori_file(start_file_path,question_type_list) + # 数据格式转换 + # generate_processed_data(filtered_data_list,major,ori_file_path+"/transformed/"+ori_file_pure_name+".processed") + # Major_2数据过滤 + # start_process_major2(ori_file_path+"/transformed/"+ori_file_pure_name+".processed",ori_file_path+"/major2_processed/"+ori_file_pure_name+".processed",major) + # 文件大小分割 + split_jsonl(ori_file_path+"/major2_processed/"+ori_file_pure_name+".processed", + ori_file_path+"/spilited_ai1/"+ori_file_pure_name, chunk_size=int(split_size)) + # AI过滤第一遍 + # ai1_file_list = get_spilit_file_list(ori_file_path+"/spilited/") + # for ai1_file_path in ai1_file_list: + # process_jsonl_file(ai1_file_path,ori_file_path+"/ai_1/",logger) + # AI过滤第二遍 + + + # 整合结果和日志 + + \ No newline at end of file diff --git a/step2_ai1.py b/step2_ai1.py new file mode 100644 index 0000000..c7782e7 --- /dev/null +++ b/step2_ai1.py @@ -0,0 +1,30 @@ + + +import logging +import sys + +from components.ai_check import process_jsonl_file + +# 调用方法 +# python3 step1_ai1.py parent_path output_path file_path file_index + +parent_path = sys.argv[1] +file_path = sys.argv[2] +file_index = sys.argv[3] + + +# 配置日志记录器 +logging.basicConfig( + level=logging.DEBUG, # 设置日志级别为 DEBUG(最低级别) + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', # 日志格式 + filename=parent_path+"/log/ai1_log_"+str(file_index)+'.log', # 将日志写入文件 + filemode='a' # 写入模式('w' 表示覆盖,'a' 表示追加) +) +# 创建日志记录器 +logger = logging.getLogger(__name__) + +if __name__ == '__main__': + # # 读取数据文件 + print("Start") + temp_filepath = file_path + process_jsonl_file(temp_filepath,parent_path+"/ai_1",logger=logger) \ No newline at end of file diff --git a/step3_ai2.py b/step3_ai2.py new file mode 100644 index 0000000..bb31e58 --- /dev/null +++ b/step3_ai2.py @@ -0,0 +1,30 @@ + + +import logging +import sys + +from components.ai_check import process_jsonl_file + +# 调用方法 +# python3 step1_ai1.py parent_path output_path file_path file_index + +parent_path = sys.argv[1] +file_path = sys.argv[2] +file_index = sys.argv[3] + + +# 配置日志记录器 +logging.basicConfig( + level=logging.DEBUG, # 设置日志级别为 DEBUG(最低级别) + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', # 日志格式 + filename=parent_path+"/log/ai2_log_"+str(file_index)+'.log', # 将日志写入文件 + filemode='a' # 写入模式('w' 表示覆盖,'a' 表示追加) +) +# 创建日志记录器 +logger = logging.getLogger(__name__) + +if __name__ == '__main__': + # # 读取数据文件 + print("Start") + temp_filepath = file_path + process_jsonl_file(temp_filepath,parent_path+"/ai_2",logger=logger) \ No newline at end of file diff --git a/step4_major.py b/step4_major.py new file mode 100644 index 0000000..dd87c17 --- /dev/null +++ b/step4_major.py @@ -0,0 +1,68 @@ +# 用于交付前重制id列表 + +import argparse +import jsonlines +from tqdm import tqdm +import re + +# 输入文件路径和输出文件路径 +def parse_args(): + parser = argparse.ArgumentParser(description="处理Major2") + parser.add_argument("--input", required=True, help="Input JSONL file path") + parser.add_argument("--output", required=True, help="Input JSONL file path") + parser.add_argument("--major", required=True, help="输入科目") + return parser.parse_args() + + +# 获取输入文件的总行数(用于进度条) +def count_lines(file_path): + with open(file_path, "r", encoding="utf-8") as f: + return sum(1 for _ in f) +def process_file(input_file, output_file,major2_keywords): + total_lines = count_lines(input_file) + # 打开输入文件进行逐行读取,打开输出文件进行逐行写入 + with jsonlines.open(input_file, mode="r") as reader, jsonlines.open(output_file, mode="w") as writer: + # 使用 tqdm 包裹 reader,显示进度条 [[10]] + for line in tqdm(reader, total=total_lines, desc="Processing", unit="line"): + # 判断major2是否有效 + if line.get("major_2"): + if any(keyword in line["major_2"] for keyword in major2_keywords): + # 去除line["major_2"]开头的空格 + line["major_2"] = line["major_2"].lstrip() + # 判断line["major_2"]中是否包含英文字符 + if any('a' <= char.lower() <= 'z' for char in line["major_2"]): + print("major2中包含英文字符,请检查!内容是:"+str(line)) + continue + # 写入修改后的行到输出文件 + # 去除*、/、\、!、空格符号 + line["major_2"] = line["major_2"].replace('*', '').replace('/', '').replace('\\', '').replace('!', '').replace(' ', '') + # 去除括号及其内容 + line["major_2"] = re.sub(r'$$|$$|(|)|$|$|{|}|〈|〉|《|》|『|』|【|】|〖|〗|〘|〙|〚|〛', '', line["major_2"]) + writer.write(line) + else: + print("major2不合法,跳过该行,内容是:"+str(line)) + continue + + + +def start_process_major2(input,output,major): + major2_keywords = [] + if major == "化学": + #化学、有机、无机、分子 + major2_keywords = ["有机","无机","分子","化学"] + elif major == "物理": + #物理、天体、宇宙、行星、黑洞、原子、核、力、动力、力学、流体、光、光子、电、电子、磁、量子、超导、热、纳米、晶体、半导体、能量、相对、波动、振动 + major2_keywords = ["天体","宇宙","行星","黑洞","原子","核","力","动力","力学","流体","光","光子","电","电子","磁","量子","超导","热","纳米","晶体","半导体","能量","相对","波动","振动","物理"] + else: + print("请输入正确的科目!") + exit() + process_file(input, output,major2_keywords) + print("major2清理完成,已保存到新文件:", output) + + +if __name__ == "__main__": + args = parse_args() + parser = argparse.ArgumentParser(description="JSONL格式验证工具") + start_process_major2(args.input,args.output,args.major) + + \ No newline at end of file