# Scrape UniProt entry pages for the hits listed in temp.txt and save one .xlsx file per GO term.
import csv
import os
import random
import re
import time

import openpyxl
import requests
from bs4 import BeautifulSoup
# --- Configuration ----------------------------------------------------------
WORK_DIR = 'C:/Users/Administrator/Desktop'
INPUT_FILE = 'temp.txt'
RESULTS_SUBDIR = 'results'
UNIPROT_ENTRY_URL = 'https://www.uniprot.org/uniprot/'
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled request hangs the run

# Sheet title ("crawler results") and header row of every generated workbook.
# Header columns, in order: gene ID, transcript ID, matched UniProt ID,
# protein name, organism, gene, function, GO ID, GO description, link.
SHEET_TITLE = '爬虫结果'
COLUMN_NAMES = ['月亮谷基因编号', '月亮谷转录本编号', '匹配的UniProt编号', '蛋白名称',
                '物种', '基因', '功能', 'GO ID', 'GO Description', 'Link']


def parse_hit_line(line):
    """Split one tab-separated input line into its identifiers.

    Returns a tuple ``(gene_id, prot_id, uniprot_id)`` where ``prot_id`` is
    the second column, ``gene_id`` is ``prot_id`` up to its first dot and
    ``uniprot_id`` is the third column. Returns ``None`` for blank lines or
    lines with fewer than three non-empty leading columns.
    """
    # Strip the line terminator first: when the third column is the last one
    # it would otherwise carry '\n' into the URL and the output file name.
    fields = line.rstrip('\r\n').split('\t')
    if len(fields) < 3 or not fields[1] or not fields[2]:
        return None
    prot_id = fields[1]
    uniprot_id = fields[2]
    return prot_id.split('.')[0], prot_id, uniprot_id


def go_id_from_href(href):
    """Return the last path segment of *href* if it is a GO id, else ``None``.

    A segment counts as a GO id when the part before its first ':' is 'GO'.
    """
    candidate = href.split('/')[-1]
    if candidate.split(':')[0] != 'GO':
        return None
    return candidate


def result_file_name(category, prot_id, uniprot_id, go_id):
    """Build the workbook file name for one GO term.

    ':' in the GO id is replaced by '_' so the name is valid on Windows.
    """
    return ('GO_' + category + '_' + prot_id + '_' + uniprot_id + '_'
            + go_id.replace(':', '_') + '.xlsx')


def fetch_entry_page(uniprot_id):
    """Download and parse the UniProt entry page for *uniprot_id*.

    Anything after the first dot of the id is dropped before building the URL.
    Raises ``requests.RequestException`` on network errors or HTTP error
    statuses.
    """
    url = UNIPROT_ENTRY_URL + uniprot_id.split('.')[0]
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')


def overview_text(soup, element_id):
    """Return the text of an 'entry-overview-content' div, or '' if absent."""
    # NOTE(review): these selectors target a specific UniProt HTML layout;
    # confirm they still match what the site serves today.
    node = soup.find('div', id=element_id, class_='entry-overview-content')
    return node.get_text() if node is not None else ''


def function_text(soup):
    """Return the first annotation text without 'By similarity', or ''."""
    annotation = soup.find('div', class_='annotation')
    span = annotation.find('span') if annotation is not None else None
    if span is None:
        return ''
    return span.get_text().replace('By similarity', '')


def go_terms_from_anchors(anchors):
    """Yield ``(go_id, go_term, href)`` for every anchor that links to a GO id.

    ``None`` entries, anchors without an ``href`` and anchors whose target is
    not a GO id are skipped.
    """
    for anchor in anchors:
        if anchor is None:
            continue
        href = anchor.get('href')
        if not href:
            continue
        go_id = go_id_from_href(href)
        if go_id is None:
            continue
        yield go_id, anchor.get_text(), href


def collect_go_terms(soup):
    """Yield ``(category, go_id, go_term, href)`` for both GO sections.

    Molecular-function terms come first, then biological-process terms.
    """
    molecular = soup.find('ul', class_='noNumbering molecular_function')
    if molecular is not None:
        # Every anchor in the list is inspected; non-GO links are filtered out.
        for term in go_terms_from_anchors(molecular.find_all('a')):
            yield ('Molecular_function',) + term

    biological = soup.find('ul', class_='noNumbering biological_process')
    if biological is not None:
        # Only the first anchor of each list item is considered here.
        first_anchors = (item.find('a') for item in biological.find_all('li'))
        for term in go_terms_from_anchors(first_anchors):
            yield ('Biological_process',) + term


def save_go_row(row, file_path):
    """Write the header plus the single data *row* to a new workbook."""
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = SHEET_TITLE
    sheet.append(COLUMN_NAMES)
    sheet.append(row)
    workbook.save(file_path)


def main():
    """Process every hit in the input file and save one workbook per GO term."""
    os.chdir(WORK_DIR)
    results_dir = os.path.join(os.getcwd(), RESULTS_SUBDIR)
    # Saving fails if the target directory is missing, so create it up front.
    os.makedirs(results_dir, exist_ok=True)

    with open(INPUT_FILE, 'r') as hits:
        for line in hits:
            parsed = parse_hit_line(line)
            if parsed is None:
                continue
            gene_id, prot_id, uniprot_id = parsed

            try:
                soup = fetch_entry_page(uniprot_id)
            except requests.RequestException as err:
                # One unreachable entry should not abort the whole batch.
                print('Skipping {}: {}'.format(uniprot_id, err))
                continue

            organism = overview_text(soup, 'content-organism')
            gene = overview_text(soup, 'content-gene')
            protein = overview_text(soup, 'content-protein')
            function = function_text(soup)

            for category, go_id, go_term, link in collect_go_terms(soup):
                row = [gene_id, prot_id, uniprot_id, protein, organism, gene,
                       function, go_id, go_term, link]
                file_name = result_file_name(category, prot_id, uniprot_id, go_id)
                save_go_row(row, os.path.join(results_dir, file_name))


if __name__ == '__main__':
    main()