1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
import csv
import os
import random
import re
import sys
import time

import openpyxl
import requests
from bs4 import BeautifulSoup
# Annotate BLAST hits with UniProt entry-overview data.
#
# Reads a tab-separated BLAST result file from the desktop, keeps the hits whose
# third column (called "similarity" here) is at least 70, downloads the UniProt
# page of each remaining hit and writes the hit's own columns followed by the
# organism, gene, protein and annotation status to a tab-separated output file.
#
# NOTE(review): the column positions used below (0, 1, 2, 10, 11) look like
# BLAST tabular output (outfmt 6) -- confirm against the tool that produced the
# input file.
# NOTE(review): the original carried an `if False:` branch that scraped a
# "function" annotation which was never written out; that dead code is omitted.

_MIN_SIMILARITY = 70.0
_REQUIRED_COLUMNS = 12  # the highest column index read is 11
_REQUEST_TIMEOUT = 30  # seconds; requests.get() applies no timeout by default
_UNIPROT_URL = 'https://www.uniprot.org/uniprot/'
_PARAGRAPH = re.compile(r'<p>(.*?)</p>')


def _overview_div(soup, element_id: str):
    """Return the ``entry-overview-content`` ``<div>`` with id *element_id*.

    Raises:
        LookupError: if the parsed page contains no such element.
    """
    div = soup.find('div', id=element_id, class_='entry-overview-content')
    if div is None:
        raise LookupError('no #%s element on the page' % element_id)
    return div


def _clean_status(status: str) -> str:
    """Normalise the raw text of the status element for single-line output.

    Removes the text found between the first ``<p>``/``</p>`` pair, turns
    newlines into dots, and drops the leftover `` <p></p>`` and ``-`` pieces.
    """
    paragraphs = _PARAGRAPH.findall(status)
    # Indexing [0] unconditionally would raise IndexError when the text holds
    # no <p>...</p> pair, so the removal step is skipped in that case.
    if paragraphs:
        status = status.replace(paragraphs[0], '')
    return (
        status.replace('\n', '.')
        .replace(' <p></p>', '')
        .replace('-', '')
        .replace('leveli', 'level')
    )


def _fetch_overview(pro_id: str):
    """Download the UniProt page for *pro_id* and extract its overview fields.

    Returns:
        Tuple ``(organism, gene, protein, status)`` of strings.

    Raises:
        requests.RequestException: on network failure, timeout or HTTP error.
        LookupError: if an expected element is missing from the page.
    """
    response = requests.get(_UNIPROT_URL + pro_id, timeout=_REQUEST_TIMEOUT)
    # Fail on 4xx/5xx here instead of parsing an error page and tripping over
    # missing elements further down.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    organism = _overview_div(soup, 'content-organism').get_text()
    gene_name = _overview_div(soup, 'content-gene').get_text()
    protein = _overview_div(soup, 'content-protein').get_text()

    status_span = _overview_div(soup, 'content-status').find(
        'span', class_='context-help tooltipped-click')
    if status_span is None:
        raise LookupError('no status span inside #content-status')
    return organism, gene_name, protein, _clean_status(status_span.get_text())


os.chdir('C:/Users/Administrator/Desktop')

# The output file is opened first, exactly as before, so it is truncated even
# when the input file turns out to be missing. The input file keeps the
# platform-default encoding of the original open() call.
with open('植物病原互作通路基因蛋白爬取swiss数据库结果.txt', 'w+', encoding="utf-8") as res, \
        open('植物病原互作通路基因蛋白swiss数据库blast结果.txt', 'r') as original_file:
    for line_no, line in enumerate(original_file, start=1):
        fields = line.split('\t')
        if len(fields) < _REQUIRED_COLUMNS:
            # Blank lines are skipped quietly; anything else is reported.
            if line.strip():
                print('line %d: expected at least %d tab-separated columns, skipped'
                      % (line_no, _REQUIRED_COLUMNS), file=sys.stderr)
            continue

        gene, pro, similarity = fields[0], fields[1], fields[2]
        try:
            similarity_value = float(similarity)
        except ValueError:
            print('line %d: similarity %r is not a number, skipped'
                  % (line_no, similarity), file=sys.stderr)
            continue
        # Written as "not >=" so that NaN is rejected, as in the original test.
        if not similarity_value >= _MIN_SIMILARITY:
            continue

        evalue = fields[10]
        score = fields[11].replace('\n', '')
        pro_id = pro.split('.')[0]  # part of the hit id before the first dot

        try:
            organism, gene_1, protein, status = _fetch_overview(pro_id)
        except (requests.RequestException, LookupError) as err:
            # One unreachable or unparsable entry must not abort the batch.
            print('line %d: %s skipped: %s' % (line_no, pro_id, err),
                  file=sys.stderr)
            continue

        res.write('\t'.join([gene, pro, similarity, evalue, score,
                             organism, gene_1, protein, status]) + '\n')
|