Simple UniProt Readout using Python

Hey everyone,

recently, I had a long list of several thousand UniProt IDs and wanted to download all kinds of information from the UniProt database for each ID. To do this, I wrote a function in Python. Of course, I know that there are several solutions out there already; however, I found them rather complicated. This function is very simple to use and should be usable by anyone with basic Python knowledge. In principle, I used urllib2 to access the UniProt REST server and download the entries in XML format. Thereafter, I simply read out all the information I wanted and returned it as a list of lines.

If you want to use my function, all you have to do is copy the code below, paste it into your Python program and use it. Just call the function “get_uniprot_data(id_list)” and give it a list of UniProt IDs. That’s all. My function will return a list of rows (the first row holds the column names) with the following columns: NCBI Gene ID; Protein Full Names; Protein Short Names; Primary Gene Name; Synonym Gene Name; GO-Terms; GO_Term Names; UniProt Protein Function; DOI of Related Paper; UniProt Keywords; Protein Sequence.

Basically, you can read out any information you can find on the website; simply add the appropriate code. If you do not know how to do that, let me know what additional information you need and I might implement the code for you. Have fun!

This code works with Python 2

EDIT:

I know that this code could be better, however, it works. Maybe it is possible to implement a batch download. Furthermore, I really do not know whether this is the best way to access data from UniProt. I just needed a simple and fast solution for my needs and I wanted to share this with you. If you have any suggestions, please let me know!

import urllib2
import re
# Column names of the returned table, in output order.  The spelling
# "Uniprote_Function" is kept as-is because callers may look columns up by
# these exact strings.
_UNIPROT_HEADER = [
    "NCBI GeneID",
    "Protein_Full_Names",
    "Protein_Short_Names",
    "Primary_Gene_Name",
    "Synonym_Gene_Name",
    "GO_Terms",
    "GO_Term_Names",
    "Uniprote_Function",
    "Paper_DOI",
    "Uniprot_keywords",
    "Protein_Sequence",
]

# Compiled once; they are applied to every line of every downloaded entry.
_TAG_RE = re.compile(r'<[^>]+>')
_ID_ATTR_RE = re.compile(r'\bid="([^"]*)"')
_VALUE_ATTR_RE = re.compile(r'\bvalue="([^"]*)"')


def _strip_tags(line):
    """Return *line* with all XML tags and surrounding whitespace removed."""
    return _TAG_RE.sub('', line).strip()


def _first_group(pattern, line):
    """Return group 1 of the first *pattern* match in *line*, or ''."""
    match = pattern.search(line)
    return match.group(1) if match else ''


def _parse_uniprot_xml(lines):
    """Scan the lines of one UniProt XML entry and return one table row.

    The scan is line based (no XML parser): each line is tested for a set of
    marker substrings and the matching value is collected.  The returned list
    has one string per column of ``_UNIPROT_HEADER``; multiple values for a
    column are joined with ' ; ', the function text and the sequence are
    concatenated without a separator.
    """
    gene_ids = []
    full_names = []
    short_names = []
    primary_names = []
    synonym_names = []
    go_ids = []
    go_names = []
    function_parts = []
    dois = []
    keywords = []
    sequence_parts = []
    in_function = False
    in_sequence = False

    for line in lines:
        # Attribute values are pulled out by name instead of by fixed column
        # offsets, so indentation or a different line ending does not shift
        # the result.
        if '<dbReference type="GeneID"' in line:
            gene_ids.append(_first_group(_ID_ATTR_RE, line))
        if 'type="GO"' in line:
            go_ids.append(_first_group(_ID_ATTR_RE, line))
        if '<property type="term"' in line:
            go_names.append(_first_group(_VALUE_ATTR_RE, line))

        # Everything from the opening function comment up to (not including)
        # the line holding the next </comment> is function text.
        if '<comment type="function">' in line:
            in_function = True
        if '</comment>' in line:
            in_function = False
        if in_function:
            function_parts.append(_strip_tags(line))

        if '<dbReference type="DOI" id="' in line:
            dois.append(_first_group(_ID_ATTR_RE, line))
        if '<keyword id=' in line:
            keywords.append(_strip_tags(line))
        if '<fullName' in line:
            full_names.append(_strip_tags(line))
        if '<shortName' in line:
            short_names.append(_strip_tags(line))
        if '<name type="primary' in line:
            primary_names.append(_strip_tags(line))
        if '<name type="synonym' in line:
            synonym_names.append(_strip_tags(line))

        # Collect the text of the <sequence length=...> element.  Stripping
        # the tags from the opening and closing lines too means the residues
        # are found whether they sit on their own lines or share a line with
        # the tags, and nothing after </sequence> is picked up.
        opens_sequence = '<sequence length=' in line
        if opens_sequence or in_sequence:
            sequence_parts.append(_strip_tags(line))
        if opens_sequence:
            in_sequence = True
        if '</sequence>' in line:
            in_sequence = False

    # Drop any whitespace (line breaks included) left inside the sequence.
    sequence = ''.join(''.join(sequence_parts).split())

    return [
        ' ; '.join(gene_ids),
        ' ; '.join(full_names),
        ' ; '.join(short_names),
        ' ; '.join(primary_names),
        ' ; '.join(synonym_names),
        ' ; '.join(go_ids),
        ' ; '.join(go_names),
        ''.join(function_parts),
        ' ; '.join(dois),
        ' ; '.join(keywords),
        sequence,
    ]


def get_uniprot_data(id_list):
    """Download UniProt entries and return selected fields as a table.

    For every ID in *id_list* the entry is fetched as XML from
    ``http://www.uniprot.org/uniprot/<ID>.xml`` with ``urllib2`` and scanned
    line by line.  A progress line ``<url>   ----->   <n> / <total>`` is
    printed for every entry that could be opened, where ``<n>`` is the
    position of the ID in *id_list*.

    Args:
        id_list: Sequence of UniProt accession strings.

    Returns:
        A list of rows (lists of strings).  The first row is the header
        (NCBI GeneID, Protein_Full_Names, Protein_Short_Names,
        Primary_Gene_Name, Synonym_Gene_Name, GO_Terms, GO_Term_Names,
        Uniprote_Function, Paper_DOI, Uniprot_keywords, Protein_Sequence),
        followed by one row per ID in input order.  If an entry cannot be
        downloaded or parsed, every column of its row is "NA".  An empty
        *id_list* yields just the header row.
    """
    max_number = len(id_list)
    rows = [list(_UNIPROT_HEADER)]

    for counter, uniprot_id in enumerate(id_list, 1):
        try:
            path = "http://www.uniprot.org/uniprot/" + uniprot_id + ".xml"
            html = urllib2.urlopen(path)
            try:
                print(path + "   " + "----->" + "   " + str(counter) + " / " + str(max_number))
                row = _parse_uniprot_xml(html)
            finally:
                # Release the connection even if parsing fails.
                html.close()
        except Exception:
            # Best effort: a failing ID must not abort a long batch.
            # ``Exception`` (not a bare ``except``) lets Ctrl-C and
            # SystemExit still stop the run.
            row = ["NA"] * len(_UNIPROT_HEADER)
        rows.append(row)

    return rows