|
|
|
@ -1,3 +1,4 @@
|
|
|
|
|
import os |
|
|
|
|
import sys |
|
|
|
|
import re |
|
|
|
|
import hashlib |
|
|
|
@ -13,8 +14,8 @@ mirrors = ['https://dumps.wikimedia.org',
|
|
|
|
|
'https://ftp.acc.umu.se/mirror/wikimedia.org/dumps', |
|
|
|
|
'https://dumps.wikimedia.your.org'] |
|
|
|
|
# Tracker announce URLs handed to Torrent(trackers=...) when the torrent
# file is built. The list is closed exactly once, after the last entry.
trackers = ['udp://tracker.opentrackr.org:1337/announce',
            'udp://tracker.openbittorrent.com:6969',
            'udp://explodie.org:6969']
|
|
|
|
prev_progress = 0 |
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -47,19 +48,29 @@ def main():
|
|
|
|
|
if recombine['status'] != 'done': |
|
|
|
|
print('"articles dump recombine" job is not done yet!') |
|
|
|
|
sys.exit(-1) |
|
|
|
|
expected_file_name = f'{db}-{date}-pages-articles-multistream.xml.bz2' |
|
|
|
|
file = recombine['files'][expected_file_name] |
|
|
|
|
path = file['url'] |
|
|
|
|
download_file(expected_file_name, f'{mirror}{path}', file['size']) |
|
|
|
|
verify_checksum(expected_file_name, file['sha1'], file['size']) |
|
|
|
|
webseeds = [ |
|
|
|
|
f'{mirrors[0]}{path}', |
|
|
|
|
f'{mirrors[1]}{path}', |
|
|
|
|
] |
|
|
|
|
torrent = Torrent(path=expected_file_name, webseeds=webseeds, trackers=trackers) |
|
|
|
|
dir_name = f'{db}-{date}-pages-articles-multistream' |
|
|
|
|
try: |
|
|
|
|
os.mkdir(dir_name) |
|
|
|
|
except OSError as error: |
|
|
|
|
print(error) |
|
|
|
|
expected_file_name1 = f'{db}-{date}-pages-articles-multistream-index.txt.bz2' |
|
|
|
|
file1 = recombine['files'][expected_file_name1] |
|
|
|
|
path1 = file1['url'] |
|
|
|
|
download_file(f'{dir_name}/{expected_file_name1}', f'{mirror}{path1}', file1['size']) |
|
|
|
|
verify_checksum(f'{dir_name}/{expected_file_name1}', file1['sha1'], file1['size']) |
|
|
|
|
expected_file_name2 = f'{db}-{date}-pages-articles-multistream.xml.bz2' |
|
|
|
|
file2 = recombine['files'][expected_file_name2] |
|
|
|
|
path2 = file2['url'] |
|
|
|
|
download_file(f'{dir_name}/{expected_file_name2}', f'{mirror}{path2}', file2['size']) |
|
|
|
|
verify_checksum(f'{dir_name}/{expected_file_name2}', file2['sha1'], file2['size']) |
|
|
|
|
# webseeds = [ |
|
|
|
|
# f'{mirrors[0]}{path}', |
|
|
|
|
# f'{mirrors[1]}{path}', |
|
|
|
|
# ] |
|
|
|
|
torrent = Torrent(path=dir_name, trackers=trackers) |
|
|
|
|
print('Creating torrent file...', end='') |
|
|
|
|
torrent.generate() |
|
|
|
|
torrent.write(f'{expected_file_name}.torrent') |
|
|
|
|
torrent.write(f'{dir_name}.torrent') |
|
|
|
|
print('Done') |
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -76,7 +87,7 @@ def date_format(date):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def verify_checksum(filename, expected, size): |
|
|
|
|
print(f'Checking checksum of {filename} ', end='') |
|
|
|
|
print(f'Checking checksum of {filename} ', end='', flush=True) |
|
|
|
|
show_progress(0) |
|
|
|
|
h = hashlib.sha1() |
|
|
|
|
one_percent = size / 100 |
|
|
|
@ -91,9 +102,9 @@ def verify_checksum(filename, expected, size):
|
|
|
|
|
show_progress(int(done * 100 / size)) |
|
|
|
|
rc = h.hexdigest() |
|
|
|
|
if rc != expected: |
|
|
|
|
print(' Bad') |
|
|
|
|
print(' Bad', flush=True) |
|
|
|
|
sys.exit(-1) |
|
|
|
|
print(' Ok') |
|
|
|
|
print(' Ok', flush=True) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_job_state(base_uri, db, date, jobname): |
|
|
|
@ -113,7 +124,7 @@ def download_file(filename, uri, size):
|
|
|
|
|
headers = {'Range': f'bytes={start_from}-'} |
|
|
|
|
else: |
|
|
|
|
headers = {} |
|
|
|
|
print(f'Downloading {uri} ', end='') |
|
|
|
|
print(f'Downloading {uri} ', end='', flush=True) |
|
|
|
|
one_percent = size / 100 |
|
|
|
|
next_percent = start_from + one_percent |
|
|
|
|
show_progress(0) |
|
|
|
@ -129,17 +140,17 @@ def download_file(filename, uri, size):
|
|
|
|
|
if done >= next_percent: |
|
|
|
|
next_percent += one_percent |
|
|
|
|
show_progress(int(done * 100 / size)) |
|
|
|
|
print(" Done") |
|
|
|
|
print(" Done", flush=True) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def show_progress(percent: int):
    """Print a progress indicator for a long-running step.

    On a terminal, the first call (``percent == 0``) prints ``' 0%'`` and
    every later call moves the cursor four columns left and overwrites the
    figure in place. When stdout is not a terminal, nothing is printed for
    ``0`` and a single ``'.'`` is printed for every other value.

    Each branch prints exactly once and flushes immediately, because the
    output has no trailing newline and would otherwise sit in the buffer.

    Args:
        percent: Progress value to display, formatted as a 3-wide integer.
    """
    is_tty = sys.stdout.isatty()
    if percent == 0 and is_tty:
        print(' 0%', end='', flush=True)
    elif is_tty:
        # "\033[4D" = cursor back 4 columns, so the new figure overwrites
        # the previous one instead of being appended.
        print(f"\033[4D{percent:3d}%", end='', flush=True)
    elif percent != 0:
        print('.', end='', flush=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def find_last_date(base_uri, db): |
|
|
|
|