数据预处理-PDB文件

以下代码为个人原创,python实现,是处理PDB文件的部分常用代码,仅供参考!

1.下载PDB文件

下面是一个下载PDB文件的函数,传入的参数是一个写有pdb名字的namefile文件,函数的核心部分是三个系统命令,先通过wget下载,然后解压,最后替换名字。

1
2
3
4
5
6
7
def downloadpdb(namefile):
inputfile = open(namefile, 'r')
for eachline in inputfile:
pdbname = eachline.lower().strip()
os.system("wget http://ftp.wwpdb.org/pub/pdb/data/structures/all/pdb/pdb" + pdbname + ".ent.gz")
os.system("gzip -d pdb" + pdbname + '.ent.gz')
os.system("mv pdb" + pdbname + ".ent " + pdbname.upper() + '.pdb')

测试用例

1
2
os.chdir('/ifs/home/liudiwei/datasets/RPdatas')
downloadpdb('protein.name')

2.PDB转DSSP

将下载的PDB文件转成DSSP文件

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
# 处理一行dssp数据
def formatdsspline(dsspline):
eachline = dsspline
col = '\t' + eachline[0:5]
col += '\t' + eachline[5:10]
col += '\t' + eachline[10:12]
col += '\t' + eachline[12:15]
col += '\t' + eachline[15:25]
col += '\t' + eachline[25:39]
col += '\t' + eachline[29:34]
col += '\t' + eachline[34:38]
col += '\t' + eachline[38:50]
col += '\t' + eachline[50:61]
col += '\t' + eachline[61:72]
col += '\t' + eachline[72:83]
col += '\t' + eachline[83:92]
col += '\t' + eachline[92:97]
col += '\t' + eachline[97:103]
col += '\t' + eachline[103:109]
col += '\t' + eachline[109:115]
col += '\t' + eachline[115:122]
col += '\t' + eachline[122:129]
col += '\t' + eachline[129:136]
return col

PDB转DSSP格式,需要DSSP软件

参数:

  • pdbdir: pdb文件目录
  • dsspdir: 生成的dssp文件目录(需创建)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
def pdbToDSSP(pdbnamefile,pdbdir, dsspdir):    
pdbfiles = os.listdir(pdbdir)
#对于每个pdb文件,生成对应的dssp文件,并保存在dssp目录下
for pdb_file in pdbfiles:
pdb_name = pdb_file.split('.')[0].upper()
command = 'DSSPCMBI.EXE -x ' + pdbdir +'/'+ pdb_file + ' '+ dsspdir +"/"+ pdb_name +'.dssp'
os.system(command)
dsspfiles = os.listdir(dsspdir)
if os.path.exists(dsspdir + "/DSSP"): #判断DSSP文件是否存在,存在则删除
dsspfiles.remove("DSSP")
output=open(dsspdir + '/DSSP','w')
#循环读取dssp文件,将其合并成一个整的DSSP
with open(pdbnamefile, 'r') as namefile:
for eachline in namefile:
pdb_name = eachline.strip()
dssp_file = pdb_name + '.dssp'
#for dssp_file in dsspfiles:
#pdb_name = dssp_file.split('.')[0]
with open(dsspdir + '/' + dssp_file,"r") as f:
if not os.path.isdir(dsspdir + '/format'):
os.mkdir(dsspdir + '/format')
with open(dsspdir + '/format/' + pdb_name + '.dssp.format','w') as singleOut:
count = 0; preRes=[]
sets = set('');content=''
for eachline in f.readlines():
list1=[];oneline=[]
count+=1
list1.append(pdb_name)
if count >= 29:
eachline = formatdsspline(eachline)
oneline = eachline.split('\t')

if oneline[3].strip():
preRes = oneline[3].strip()

list1.append(eachline)
content += "".join(list1)+'\n'

if '!' == oneline[4].strip():
continue

if '!*' in eachline or not oneline[3].strip():
if preRes in sets and len(sets):
content=''
preRes=[]
continue
sets.add(preRes)
output.write(content)
singleOut.write(content)
content=''
if preRes and preRes not in sets:
output.write(content)
singleOut.write(content)
output.close()

测试

1
2
3
4
5
#test
pdbdir = 'z:/datasets/protein/pdb'
dsspdir = 'Z:/datasets/protein/DSSPdir'
proname = 'Z:/datasets/protein/protein.name'
pdbToDSSP(proname,pdbdir, dsspdir)

3.DSSP抽取序列

从一个整合的DSSP文件中抽取序列文件

从格式化后的dssp文件获取序列信息

参数:dsspfile为格式过的DSSP文件,seqfile为输出的序列文件,同时输出序列文件

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
def getSeqFromDSSP(dsspfile, seqfile, minLen):
with open(dsspfile, 'r') as inputfile:
if not seqfile.strip():
seqfile = 'protein'+minLen + '.dssp.seq'
outchain = open('protein40.chain.all', 'w')
with open(seqfile, 'w') as outputfile:
residue=[];Ntype=[]
preType=[];preRes=[]
firstline=[];secondline=[];content=''
for eachline in inputfile:
oneline = eachline.split('\t')
residue = oneline[0]
if not residue.strip():
continue
Ntype = oneline[3].strip()
if not Ntype.strip():
continue
if preRes!=residue:
content = ''.join(firstline)+'\n' + ''.join(secondline) +'\n'
if len(secondline) >=minLen and not 'X' in secondline:
outchain.write(''.join(firstline) + '\n')
outputfile.write(content)
firstline=[]
firstline.append('>' + residue + ':' + Ntype)
secondline=[];secondline.append(oneline[4].strip())
preRes = residue;preType = Ntype
continue
if Ntype != preType:
content = ''.join(firstline)+'\n' + ''.join(secondline) +'\n'
if len(secondline) >=minLen and not 'X' in secondline:
outchain.write(''.join(firstline) + '\n')
outputfile.write(content)
firstline=[]
firstline.append('>' + residue + ':' + Ntype)
secondline=[];secondline.append(oneline[4].strip())
preRes = residue;preType = Ntype
else: #如果Ntype不为空,且等于preType
secondline.append(oneline[4].strip())
content = ''.join(firstline)+'\n' + ''.join(secondline) +'\n'
if len(secondline) >=minLen and not 'X' in secondline: #选择长度大于40
outchain.write(''.join(firstline) + '\n')
outputfile.write(content)
outchain.close()

测试:

1
2
3
4
os.chdir(r"E:\3-CSU\Academic\_Oriented\analysis\experiment\Datasets\ptest.dssp")
pdbfile = 'DSSP'
outfile = 'protein.seq'
getSeqFromDSSP(pdbfile,outfile)

4.对序列做blast聚类

设置相应的参数,在服务器上跑blast,代码如下:

1
ifs/share/lib/cd-hit-v4.5.4/cd-hit -i /ifs/home/liudiwei/datasets/protein40.seq -o /ifs/home/liudiwei/experiment/cdhit/fasta.40 -c 0.4 -n 2 -M 2000

5.生成聚类后的DSSP,得到protein.name、protein.seq、protein.chain三个文件

从原来的DSSP文件中,根据聚类后的链名抽取新的DSSP文件

1
2
3
4
5
6
7
8
9
10
11
12
13
14
 # extract dssp from old dssp file
def extractDSSP(dsspfile, chainname, outfile):
with open(outfile, 'w') as outdssp:
with open(dsspfile, 'r') as inputdssp:
for eachline in inputdssp:
oneline = eachline.split('\t')
#preNum = oneline[2].strip()
with open(chainname,'r') as chainfile:
for eachchain in chainfile:
protein_ame = eachchain[1:5]
chain_id = eachchain[6:7]
if oneline[0].strip() == protein_ame and oneline[3].strip() == chain_id:
outdssp.write(eachline)
break

测试实例:

1
2
3
4
dsspfile = '/ifs/home/liudiwei/datasets/832.protein/DSSPdir1/DSSP'
chainname = '/ifs/home/liudiwei/experiment/step1/832p.cluster/cdhit/protein.chain'
outfile = '/ifs/home/liudiwei/experiment/step1/832p.cluster/cdhit/DSSP'
extractDSSP(dsspfile, chainname, outfile )