- 论坛徽章:
- 3
|
你好,
在分析一个巨大的文件
[root@Dev-Mysql clinvar]# sed -i '4,251d' ClinVarFullRelease_2017-06.xml
4和251这两个行号是我一个循环中获得的变量。
对于这个巨大的文件,是个xml文件,xml的格式很规范,
我需要将它截断成数万个小文件。
<特定标识>
...
...
</特定标识>
[root@Dev-Mysql clinvar]# cat analyzer.sh
#!/bin/bash
SED=/bin/sed
GREP=/bin/grep
HEAD=/usr/bin/head
OBJ_F=/data/clinvar/ClinVarFullRelease_2017-06.xml
RES_D=/data/clinvar/individual1706/
MID_F=/data/clinvar/individual1706/.mid.txt
${HEAD} -10000 ${OBJ_F} > ${MID_F}
CVS_ID=`${GREP} "<ClinVarSet" ${MID_F} | ${HEAD} -1 | ${SED} 's/^.*=//' | ${SED} 's/"//g' | ${SED} 's/>//'`
STR_N=`${GREP} -n "<ClinVarSet" ${MID_F} | ${HEAD} -1 | ${SED} s/:.*$//`
END_N=`${GREP} -n "</ClinVarSet" ${MID_F} | ${HEAD} -1 | ${SED} s/:.*$//`
echo $STR_N
echo $END_N
until [ -z ${STR_N} ]
do
CVS_ID=`${GREP} "<ClinVarSet" ${MID_F} | ${HEAD} -1 | ${SED} 's/^.*=//' | ${SED} 's/"//g' | ${SED} 's/>//'`
echo $CVS_ID
RES_F=${RES_D}${CVS_ID}.xml
${SED} -n "${STR_N},${END_N}p" ${MID_F} > ${RES_F}
${SED} -i "${STR_N},${END_N}d" ${OBJ_F}
${HEAD} -10000 ${OBJ_F} > ${MID_F}
STR_N=`${GREP} -n "<ClinVarSet" ${MID_F} | ${HEAD} -1 | ${SED} s/:.*$//`
echo $STR_N
END_N=`${GREP} -n "</ClinVarSet" ${MID_F} | ${HEAD} -1 | ${SED} s/:.*$//`
done
|
|