# coding: utf-8
"""Print the overall span covered by the exons of each exonerate GFF file.

For every sub-directory of the protein directory, each non-empty file whose
name contains ``new_exonerate_final_`` is parsed and one line is printed:

    <protein_folder> \t <name> \t <real_start> \t <end> \t

NOTE(review): the original file had lost its indentation; the print is
assumed to happen once per exonerate file -- confirm against the notebook.
"""

# In[1]:

#import packages:
import os
from os import listdir
import sys

import pandas as pd

# Directory holding one sub-directory per query protein.
DEFAULT_PROTEIN_DIR = "/Users/asd/CODES/test/proteins_human"

# Substring identifying the exonerate result files inside a protein folder.
EXONERATE_MARKER = 'new_exonerate_final_'


def parse_exonerate_gff(text):
    """Extract the name and overall coordinates from exonerate GFF text.

    The name is everything before the first ``:`` in the text, and the
    offset ("fake start") is the integer following the first ``subseq(``.
    Each non-blank line is split on tabs; fields 4 and 5 (indices 3 and 4)
    are read as the exon start and end.

    Args:
        text: full content of one exonerate GFF file.

    Returns:
        A ``(name, real_start, end)`` tuple where ``real_start`` is the
        offset plus the smallest exon start, and ``end`` is ``real_start``
        plus the distance from the smallest start to the largest exon end.

    Raises:
        IndexError: if the text has no ``subseq(`` marker or a line has
            fewer than five tab-separated fields.
        ValueError: if the offset or a coordinate is not an integer, or if
            the text contains no exon lines.
    """
    name = text.split(':', 1)[0]
    fake_start = int(text.split('subseq(', 1)[1].split(',', 1)[0])

    starts = []
    ends = []
    for line in text.splitlines():
        # Skip blank and whitespace-only lines (e.g. a trailing newline).
        if not line.strip():
            continue
        fields = line.split('\t')
        starts.append(int(fields[3]))
        ends.append(int(fields[4]))

    # Only the extremes are needed, so min/max replace two full sorts.
    first_start = min(starts)
    last_end = max(ends)
    real_start = fake_start + first_start
    end = real_start + last_end - first_start
    return name, real_start, end


def iter_gene_spans(path=DEFAULT_PROTEIN_DIR):
    """Yield ``(protein_folder, name, real_start, end)`` per exonerate file.

    Args:
        path: directory containing one sub-directory per protein.

    Yields:
        One tuple for every non-empty file whose name contains
        ``new_exonerate_final_``, in ``os.listdir`` order.
    """
    for protein_folder in os.listdir(path):
        folder_path = os.path.join(path, protein_folder)
        # Skip stray files such as macOS ".DS_Store": the previous check
        # compared against '.DS_store' (wrong case), never matched, and so
        # os.listdir() was called on a regular file.
        if not os.path.isdir(folder_path):
            continue

        for file_ in os.listdir(folder_path):
            if EXONERATE_MARKER not in file_:
                continue
            file_path = os.path.join(folder_path, file_)
            # Empty files carry no exons and have no header to parse.
            if os.path.getsize(file_path) == 0:
                continue
            # Context manager so the handle is closed deterministically.
            with open(file_path) as handle:
                text = handle.read()
            name, real_start, end = parse_exonerate_gff(text)
            yield protein_folder, name, real_start, end


def main(path=DEFAULT_PROTEIN_DIR):
    """Print one tab-separated line per exonerate file found under *path*."""
    for span in iter_gene_spans(path):
        print('%s \t %s \t %s \t %s \t' % span)


if __name__ == "__main__":
    main()