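# Problem: files were downloaded repeatedly. Fix: check for already-downloaded files and their sizes, and skip EOF files that were downloaded completely. Remaining issue: the download speed is very slow.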
# -*- coding:utf-8 -*-
# Author:PasserQi
# Time:2019-4-5
# Download the precise-orbit (EOF) files for the Sentinel-1 products in a folder.
# Note: products are recognised by their file-name suffix; set FILE_TYPE to
# ".SAFE" for unpacked products or to ".zip" for archives.
import datetime
import os
import re
import ssl
import sys
import time
import urllib
import urllib.parse
import urllib.request
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen

try:
    from bs4 import BeautifulSoup
except ImportError:
    # Keep the helper functions importable without bs4; the search step
    # raises a clear error if it is actually needed.
    BeautifulSoup = None

# Parameters to adjust
dir_path = r'/media/lll/My Passport/1.s1_data'  # directory holding the Sentinel data (linux)
# dir_path = r'D:\1.s1_data'  # directory holding the Sentinel data (win10)
out_path = r'/media/lll/My Passport/2.s1_precision_oribit_data'  # directory the orbit files are saved to (linux)
# out_path = r'D:\2.s1_precision_oribit_data'  # directory the orbit files are saved to (win10)
FILE_TYPE = ".zip"  # product suffix: .SAFE or .zip
IsDownload = True  # whether to download after searching: True / False

download_urls = []
error_url = []
url_prefix = 'https://qc.sentinel1.eo.esa.int/aux_poeorb/'  # search/download address


def download(dest_dir, url):
    """Download *url* to the file path *dest_dir*, recording failures.

    Despite its name, *dest_dir* is the full path of the file to write.
    On failure the URL is added to the module-level ``error_url`` list
    (once); on success it is removed from that list.
    """
    print("downloading from:{}\n\t to {}\n".format(url, dest_dir))
    try:
        urllib.request.urlretrieve(url, dest_dir, callbackfunc)
    except Exception as exc:
        # Best-effort per-URL download: report and carry on with the next
        # one.  ``Exception`` (not a bare except) so Ctrl-C still works.
        if url not in error_url:
            error_url.append(url)
        print('\tError retrieving the URL: {0} ({1})'.format(url, exc))
    else:
        print("\t[done]")
        if url in error_url:
            error_url.remove(url)


def callbackfunc(blocknum, blocksize, totalsize):
    """Progress hook for ``urlretrieve``.

    @blocknum: number of blocks transferred so far
    @blocksize: size of one block in bytes
    @totalsize: size of the remote file; may be 0 or negative when the
                server does not report it
    """
    received = blocknum * blocksize
    if totalsize > 0:
        percent = min(100.0, 100.0 * received / totalsize)
        sys.stdout.write(" > Downloaded %d of %d bytes (%0.2f%%)\r"
                         % (received, totalsize, percent))
    else:
        # Unknown size: a percentage cannot be computed.
        sys.stdout.write(" > Downloaded %d bytes\r" % received)
    sys.stdout.flush()


def get_yestoday(mytime):
    """Return the day before *mytime* as a ``YYYYMMDD`` string.

    Only the first eight characters of *mytime* (``YYYYMMDD``) are used.
    Raises ``ValueError`` if they do not form a valid date.
    """
    day = datetime.datetime(int(mytime[0:4]), int(mytime[4:6]), int(mytime[6:8]))
    return (day - datetime.timedelta(days=1)).strftime('%Y%m%d')


def get_total_size(response):
    """Return the ``Content-Length`` of *response* as an int, or None.

    Prints a notice and returns None when the header is missing or is
    not a number.
    """
    try:
        headers = getattr(response, "headers", None)
        if headers is None:
            headers = response.info()
        return int(headers.get('Content-Length').strip())
    except (AttributeError, TypeError, ValueError):
        print("> Problem getting size")
        return None


def _build_search_url(product_name):
    """Build the EOF search URL for a product file name, or None if no date is found."""
    dates = re.findall(r"\d{8}", product_name)
    if not dates:
        return None
    try:
        # Original author's note: searching with 20170316 returns the data
        # of 20170317, so the query uses the day before the acquisition.
        day = get_yestoday(dates[0])
    except ValueError:
        # Eight digits that are not a calendar date.
        return None
    query = {
        'sentinel1__mission': product_name[0:3],
        # 20170101 --> 2017-01-01
        'validity_start': "{0}-{1}-{2}".format(day[0:4], day[4:6], day[6:8]),
    }
    return url_prefix + '?' + urllib.parse.urlencode(query)


def _find_eof_url(product_name):
    """Return the first EOF link listed for *product_name*, or None."""
    search_url = _build_search_url(product_name)
    if search_url is None:
        print(" > No acquisition date found in {0}, skipping".format(product_name))
        return None
    if BeautifulSoup is None:
        raise ImportError("bs4 (BeautifulSoup) is required to search for EOF files")
    try:
        with urllib.request.urlopen(search_url) as html:
            dom = BeautifulSoup(html, 'lxml')
    except URLError as e:
        print(" > Could not query {0}: {1}".format(search_url, e))
        return None
    # href=True skips anchors without an href attribute.
    eof_links = [a['href'] for a in dom.find_all("a", href=True)
                 if a['href'].endswith('.EOF')]
    if not eof_links:
        print(" > No EOF found for {0}".format(product_name))
        return None
    # Absolute hrefs pass through unchanged; relative ones are resolved.
    return urllib.parse.urljoin(search_url, eof_links[0])


def _needs_download(local_path, url):
    """Decide whether *url* still has to be fetched into *local_path*.

    Returns False when the local file already matches the remote size
    (within 1%) or when a terminal SSL problem makes a download pointless.
    A partial local file is removed so the download starts again.
    """
    if not os.path.isfile(local_path):
        return True
    try:
        with urlopen(Request(url, method='HEAD'), timeout=30) as response:
            remote_size = get_total_size(response)
        # Only compare when a size could be derived.
        if remote_size:
            local_size = os.path.getsize(local_path)
            tolerance = local_size * .01
            if local_size - tolerance < remote_size < local_size + tolerance:
                print(" > Download file {0} exists! \n > Skipping download of {1}. ".format(local_path, url))
                return False
            # Partial file: remove it and start again.
            print(" > Found {0} but it wasn't fully downloaded. Removing file and downloading again.".format(local_path))
            os.remove(local_path)
    except ssl.CertificateError as e:
        print(" > ERROR: {0}".format(e))
        print(" > Could not validate SSL Cert. Skipping download of {0}".format(url))
        return False
    except HTTPError as e:
        if e.code == 401:
            print(" > IMPORTANT: Your user may not have permission to download this type of data!")
        else:
            print(" > Unknown Error, Could not get file HEAD: {0}".format(e))
    except URLError as e:
        print("URL Error (from HEAD): {0}, {1}".format(e.reason, url))
        if "ssl.c" in "{0}".format(e.reason):
            print("IMPORTANT: Remote location may not be accepting your SSL configuration. This is a terminal error.")
            return False
    return True


def main():
    """Search the EOF for every product in ``dir_path`` and download them."""
    files = os.listdir(dir_path)
    for n, name in enumerate(files, start=1):
        if not name.endswith(FILE_TYPE):
            continue
        # Look up the EOF from the information in the file name.
        eof_url = _find_eof_url(name)
        if eof_url is not None and eof_url not in download_urls:
            download_urls.append(eof_url)
        print(" > Search for %d of %d URL\r" % (n, len(files)), end='')

    if not IsDownload:
        return

    for eof in download_urls:
        eof_name = os.path.basename(eof).split('?')[0]  # file name
        savefile = os.path.join(out_path, eof_name)  # where it is saved
        # Check the file in out_path (where downloads go), not the
        # current working directory.
        if not _needs_download(savefile, eof):
            continue
        print(" > !!!!!!!!!!!! \n Download file {0}! \n > Doing download of {1}. ".format(eof_name, eof))
        download(savefile, eof)

    if error_url:
        print(" > {0} download(s) failed:".format(len(error_url)))
        for url in error_url:
            print("\t{0}".format(url))


if __name__ == '__main__':
    main()