#!/usr/bin/perl -w # Bruno Contreras-Moreira, CCG/UNAM, March 2005 # # program: modelcomp.pl # arguments: alignfile templatePDBfile # file formats: ## 1) alignfile (FASTA format, aligned sequence in one line, - are gaps, query on top, template below): #>query name #MARRKRRNFNKQATEIL---LSEHRK #>templ name PDBchain #------KNATRESTSTLKAWLSEHRK # ## 2) templatePDBfile (PDB format, sequence must be the same as in alignfile, only ATOM records are read) #ATOM 2236 OXT ALA P 78 -17.798 -2.102 -26.704 1.00 15.00 O #ATOM 2237 H ALA P 78 -17.947 0.175 -29.159 1.00 15.00 H #ATOM 2238 HA ALA P 78 -17.762 -2.004 -29.785 1.00 15.00 H #ATOM 2239 1HB ALA P 78 -15.020 -1.302 -28.901 1.00 15.00 H # # # exercise: read and understand the code and only then complete the mutateResidue function, see below use strict; ## declare here the important variables ####################################### my $progname = "modelcomp.pl"; my ($alignfile,$templatePDBfile); my ($templseq,$templseq_nogaps,$templname,$templ_chain,$templ_atom); my (@templPDBcoordinates,$n_of_templ_res_Align,$n_of_templ_res_PDB); my ($queryseq,$queryname); my (%aa3to1,%aa1to3); # to be filled by read_amino_codes(), see below ## check the number of arguments, it needs two ################################# if(scalar(@ARGV) < 2) { die "usage: $progname \n"; # die means print this and exit } else { ($alignfile,$templatePDBfile) = @ARGV; # read the first two arguments, ignore the rest } ## read and parse alignfile #################################################### my $seqname; open(ALIGN,$alignfile) || die "# $progname : cannot read $alignfile, sorry\n"; while() { chomp; # delete newline characters if(/^>/) # if line starts with '>' { my @line = split(/\s+/,$_); # split this line in words separated by 1 or + blanks $seqname = $line[0]; if($seqname eq ">templ") { $templ_chain = $line[2]; $templname = $line[1]; } else{ $queryname = $line[1]; } } else { if($seqname eq ">templ") { $templseq = $_; } else { $queryseq = $_; } } } close(ALIGN); $templseq_nogaps = $templseq; $templseq_nogaps =~ s/-//g; # remove gaps to calculate sequence length $n_of_templ_res_Align = length($templseq_nogaps); print "# $progname : parsed alignment (aligned template contains $n_of_templ_res_Align residues)\n"; print "#>query $queryname\n#$queryseq\n"; print "#>templ $templname $templ_chain\n#$templseq\n\n"; ## read and parse templatePDBfile ############################################### my $readPDBchain; $n_of_templ_res_PDB = 0; open(PDB,$templatePDBfile) || die "# $progname : cannot read $templatePDBfile, sorry\n"; while() { $readPDBchain = substr($_,21,1); # PDB chain identifier is at column 22, extracted with substr if(/^ATOM/ && $readPDBchain eq $templ_chain) # read lines starting with 'ATOM' with the wanted chain { $templ_atom = substr($_,13,4); if($templ_atom =~ /CA/) # count number of PDB residues (Calpha atoms, lines with 'CA') { $n_of_templ_res_PDB++; } # take only lines with backbone atoms, where backbone is defined by # four atoms, Calpha & C,O,N, that form the peptide bond if($templ_atom =~ /N / || $templ_atom =~ /CA/ || $templ_atom =~ /C /|| $templ_atom =~ /O /) { push(@templPDBcoordinates,$_); # put this line in the back of the array } } } close(PDB); print "# $progname : parsed template contains $n_of_templ_res_PDB residues\n\n"; ## check that template sequence has same length in alignment & PDB template ###### if($n_of_templ_res_PDB != $n_of_templ_res_Align) { die "# $progname : template sequence has different length in alignment and PDB file\n"; } ## now build comparative model of backbone following the alignment ######################### &read_amino_codes(); # it will be needed in mutateResidue my ($r,$templAA,$queryAA,@PDBresidue,@backbonePDBModel,$n_of_residue); $n_of_residue = 0; for($r=0;$r<$n_of_templ_res_Align;$r++) { $templAA = substr($templseq,$r,1); # take substring of $templseq on length 1 starting at $r $queryAA = substr($queryseq,$r,1); ## aligned position if($templAA ne "-" && $queryAA ne "-") { # this function gets one residue from @templPDBcoordinates, # as indicated by $templAA and $n_of_residue, and mutates it # returns an empty array if anything goes wrong @PDBresidue = mutateResidue($n_of_residue,$templAA,$queryAA,@templPDBcoordinates); if(!@PDBresidue) { die "# $progname : alignment and PDB template sequences do not match\n"; } push(@backbonePDBModel,@PDBresidue); $n_of_residue++; # move one residue forward } ## gap in query if($queryAA eq "-") { $n_of_residue++; # skip one PDB residue } } print "# $progname : PDB backbone model of $queryname\n\n"; print "HEADER $queryname modelled using $templname as template\n"; print "REMARK calculated by $progname\n"; print @backbonePDBModel; print "TER\n"; ###################################### functions and procedures ############################ ############################################################################################# sub read_amino_codes() { # sets the 3 to 1 and 1 to 3 amino acid names $aa3to1{"ALA"} = "A"; $aa3to1{"CYS"} = "C"; $aa3to1{"ASP"} = "D"; $aa3to1{"GLU"} = "E"; $aa3to1{"PHE"} = "F"; $aa3to1{"GLY"} = "G"; $aa3to1{"HIS"} = "H"; $aa3to1{"ILE"} = "I"; $aa3to1{"LYS"} = "K"; $aa3to1{"LEU"} = "L"; $aa3to1{"MET"} = "M"; $aa3to1{"ASN"} = "N"; $aa3to1{"PRO"} = "P"; $aa3to1{"GLN"} = "Q"; $aa3to1{"ARG"} = "R"; $aa3to1{"SER"} = "S"; $aa3to1{"THR"} = "T"; $aa3to1{"VAL"} = "V"; $aa3to1{"TRP"} = "W"; $aa3to1{"TYR"} = "Y"; $aa1to3{"A"} = "ALA"; $aa1to3{"C"} = "CYS"; $aa1to3{"D"} = "ASP"; $aa1to3{"E"} = "GLU"; $aa1to3{"F"} = "PHE"; $aa1to3{"G"} = "GLY"; $aa1to3{"H"} = "HIS"; $aa1to3{"I"} = "ILE"; $aa1to3{"K"} = "LYS"; $aa1to3{"L"} = "LEU"; $aa1to3{"M"} = "MET"; $aa1to3{"N"} = "ASN"; $aa1to3{"P"} = "PRO"; $aa1to3{"Q"} = "GLN"; $aa1to3{"R"} = "ARG"; $aa1to3{"S"} = "SER"; $aa1to3{"T"} = "THR"; $aa1to3{"V"} = "VAL"; $aa1to3{"W"} = "TRP"; $aa1to3{"Y"} = "TYR"; } ########################################################################## # This function gets one residue from @templPDBcoordinates, # as indicated by $templAA and $n_of_residue, and mutates it # returns an empty array if anything goes wrong. # To complete the code you will need to have a look at the PDB format (see above) # understand that residues (groups ot atoms) are identified by the substring # that includes the name, chain and residue number, such as "ALA P 78 ". This # substring can be defined as substr($atomline,17,10) # see here for full PDB format http://www.rcsb.org/pdb/docs/format/pdbguide2.2/guide2.2_frame.html sub mutateResidue { my ($n_of_residue,$templAA,$queryAA,@templPDBcoordinates) = @_; my ($r,$lastr,$currentRes,$previousRes,$residues_so_far,$mutatedPDBatom,@PDBresidue); ## 1) find residue $n_of_residue in @templPDBcoordinates $residues_so_far = 0; $r = 0; $lastr = scalar(@templPDBcoordinates); # scalar gives the size of the array while($r<$lastr) { $currentRes = substr($templPDBcoordinates[$r],17,10); if($r>0 && $currentRes ne $previousRes) # new residue found { # aquí falta código (2 líneas) } $previousRes = $currentRes; # remembers previous residue $r++; } ## 2) check the found residue is the same as $templAA (use $aa3to1) $currentRes = substr($templPDBcoordinates[$r],17,3); # aquí falta código (una o dos líneas) ## 3) mutate the residue to $queryAA (use $aa1to3) $currentRes = substr($templPDBcoordinates[$r],17,10); while($r<$lastr && substr($templPDBcoordinates[$r],17,10) eq $currentRes) { # aquí falta código (2 líneas) $r++; } ## 4) finally, return the mutated residue return @PDBresidue; } ##########################################################################