1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
|
#
# Copyright (c) 2016, Alliance for Open Media. All rights reserved
#
# This source code is subject to the terms of the BSD 2 Clause License and
# the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
# was not distributed with this source code in the LICENSE file, you can
# obtain it at www.aomedia.org/license/software. If the Alliance for Open
# Media Patent License 1.0 was not distributed with this source code in the
# PATENTS file, you can obtain it at www.aomedia.org/license/patent.
#
# This simple script pulls test files from the webm homepage.
# It only downloads a file if:
# 1) the file (or the test_data folder) does not exist, or
# 2) the SHA of the existing file does not match the expected SHA.
import pycurl
import csv
import hashlib
import re
import os.path
import time
import itertools
import sys
import getopt
#globals
url = ''
file_list_path = ''
local_resource_path = ''
# Helper functions:
# A simple function which returns the sha hash of a file in hex
def get_file_sha(filename, chunk_size=None):
    """Return the SHA-1 digest of a file as a hex string.

    The file is opened in binary mode and read in fixed-size chunks, so
    the whole file is never held in memory at once.

    Args:
        filename: Path of the file to hash.
        chunk_size: Number of bytes to read per call. Defaults to the
            module-level HASH_CHUNK when omitted.

    Returns:
        The hex digest string, or None if the file could not be opened or
        read (an error message is printed in that case).
    """
    if chunk_size is None:
        chunk_size = HASH_CHUNK
    sha_hash = hashlib.sha1()
    try:
        # Named 'stream' rather than 'file' to avoid shadowing the builtin.
        with open(filename, 'rb') as stream:
            buf = stream.read(chunk_size)
            while buf:
                sha_hash.update(buf)
                buf = stream.read(chunk_size)
    except IOError:
        print("Error reading " + filename)
        return None
    return sha_hash.hexdigest()
# Downloads a file from a url, and then checks the sha against the passed
# in sha
def download_and_check_sha(url, filename, sha):
    """Download a file into the local resource directory and verify its SHA.

    Args:
        url: Base URL; the file is fetched from url + "/" + filename.
        filename: Name of the file, used both as the remote name and as the
            name under the module-level local_resource_path directory.
        sha: Expected hex digest, compared against get_file_sha() of the
            downloaded file.

    Returns:
        True if the digest of the downloaded file equals sha, else False.

    Any exception raised while setting up or performing the transfer
    propagates to the caller.
    """
    path = os.path.join(local_resource_path, filename)
    # The file has to be closed (and so flushed) before it is hashed; the
    # with/finally pair also releases both handles if the transfer raises.
    with open(path, "wb") as fp:
        curl = pycurl.Curl()
        try:
            curl.setopt(pycurl.URL, url + "/" + filename)
            curl.setopt(pycurl.WRITEDATA, fp)
            curl.perform()
        finally:
            curl.close()
    return get_file_sha(path) == sha
# Constants
# Maximum number of download attempts made for each file.
ftp_retries = 3
# Index of the SHA column in a row of the file list.
SHA_COL = 0
# Index of the file-name column in a row of the file list.
NAME_COL = 1
# Number of columns a file-list row must have; rows of any other length
# are skipped.
EXPECTED_COL = 2
# Number of bytes read per call while hashing a file.
HASH_CHUNK = 65536
# Main script
# Parse the command line: -u/--url, -i/--input_csv and -o/--output_dir.
try:
    opts, args = getopt.getopt(
        sys.argv[1:], "u:i:o:", ["url=", "input_csv=", "output_dir="])
except getopt.GetoptError:
    print('get_files.py -u <url> -i <input_csv> -o <output_dir>')
    sys.exit(2)

for opt, arg in opts:
    if opt in ("-u", "--url"):
        url = arg
    elif opt in ("-i", "--input_csv"):
        file_list_path = os.path.join(arg)
    elif opt in ("-o", "--output_dir"):
        local_resource_path = os.path.join(arg)

# All three values are required. Checking the parsed values (rather than the
# raw argument count) also accepts the long and attached option forms and
# rejects invocations that leave one of the options out.
if not (url and file_list_path and local_resource_path):
    print("Expects two paths and a url!")
    sys.exit(1)

if not os.path.isdir(local_resource_path):
    os.makedirs(local_resource_path)

# The file list separates its columns with runs of spaces, but python's csv
# class only accepts a single-character delimiter, so each run is collapsed
# to one space before the line is handed to the reader.
file_shas = []
file_names = []
with open(file_list_path, "rb") as file_list_csv:
    collapsed_lines = (re.sub(' +', ' ', line) for line in file_list_csv)
    for row in csv.reader(collapsed_lines, delimiter=' '):
        # Skip rows that do not have exactly the expected columns.
        if len(row) != EXPECTED_COL:
            continue
        file_shas.append(row[SHA_COL])
        file_names.append(row[NAME_COL])

# Download files, only if they don't already exist and have correct shas
for filename, sha in zip(file_names, file_shas):
    path = os.path.join(local_resource_path, filename)
    if os.path.isfile(path) and get_file_sha(path) == sha:
        print(path + ' exists, skipping')
        continue
    for retry in range(ftp_retries):
        print("Downloading " + path)
        if download_and_check_sha(url, filename, sha):
            break
        print("Sha does not match, retrying...")
    else:
        # Every attempt produced a mismatching file; report it instead of
        # moving on silently.
        print("Giving up on " + path + " after " + str(ftp_retries) +
              " attempts")
|