add --delay

dave 2017-07-22 14:33:20 -07:00
parent 682694e7c7
commit 2b2dcb50a3
1 changed file with 26 additions and 16 deletions
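
In short, this commit adds a --delay option (carried in ScrapeConfig and slept before re-issuing a ranged download request), gates info-level logging behind a new -v/--verbose flag, and introduces AlreadyDownloadedException so files that are already complete are reported as ALREADY COMPLETE rather than FAILED.

A hypothetical invocation exercising the new flags (the script name, URL, and paths here are placeholders, not taken from this commit):

    python scraper.py -u http://example.com/pub/ -o ./mirror -p 5 -d 2 -v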


@@ -5,6 +5,7 @@ import argparse
 import asyncio
 import logging
 import traceback
+from time import sleep
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin, unquote, urlsplit, urlunsplit
 from collections import namedtuple
@@ -13,7 +14,7 @@ from concurrent.futures import ThreadPoolExecutor
 from contextlib import closing
 
 
-ScrapeConfig = namedtuple("ScrapeConfig", "output loop executor base_url visited futures semaphore")
+ScrapeConfig = namedtuple("ScrapeConfig", "output loop executor base_url visited futures semaphore delay")
 """
 output: base dest dir to put downloaded files
 loop: asyncio loop object
@@ -22,6 +23,11 @@ ScrapeConfig = namedtuple("ScrapeConfig", "output loop executor base_url visited
 visited: list of urls already visiting/visited
 """
 
+
+class AlreadyDownloadedException(Exception):
+    pass
+
 
 http = Session()
@@ -58,7 +64,6 @@ def stream_to_file(response, url, options):
     if not local_path.startswith(options.output):
         raise Exception("Aborted: directory traversal detected!")
 
-    logging.info("Downloading {} to {}".format(url, local_path))
     os.makedirs(os.path.dirname(local_path), exist_ok=True)
     try:
         if os.path.exists(local_path):
@@ -68,20 +73,20 @@ def stream_to_file(response, url, options):
             remote_size = int(response.headers.get("Content-length"))
             if fsize == remote_size:
-                raise Exception("Already downloaded")
+                raise AlreadyDownloadedException("Already downloaded")
             logging.info("{} already exists, restarting request with range {}-{}".format(local_path, fsize,
                                                                                          remote_size))
+            if options.delay:
+                sleep(options.delay)
+            logging.warning("Downloading {} to {}".format(url, local_path))
             response = stream_url(url, headers={"Range": "bytes={}-{}".format(fsize, remote_size)})
-        response.raise_for_status() #TODO: clobber file and restart w/ no range header if range not satisfiable
+        response.raise_for_status()  # TODO: clobber file and restart w/ no range header if range not satisfiable
         with open(local_path, "wb") as f:
             for chunk in response.iter_content(chunk_size=256 * 1024):
                 f.write(chunk)
-    except:
-        traceback.print_exc()
-        raise
     finally:
         options.semaphore.release()
 
     try:
@@ -102,7 +107,7 @@ async def scrape_url(url, options, skip=False):
     g = await options.loop.run_in_executor(None, stream_url, url)
     if g.status_code != 200:
-        logging.warning("Fetch failed, code was %s", g.status_code)
+        logging.error("Fetch failed, code was %s", g.status_code)
         return
 
     content_type = g.headers.get("Content-Type", "")
@@ -130,24 +135,28 @@ async def scrape_url(url, options, skip=False):
             options.futures.remove(item)
             exc = future.exception()
             if exc:
-                logging.error("FAILED: %s: %s", url, exc)
+                if type(exc) is AlreadyDownloadedException:
+                    logging.info("ALREADY COMPLETE: url: %s", url)
+                else:
+                    logging.error("FAILED: %s: %s", url, exc)
             else:
-                logging.info("COMPLETED downloading url %s to %s", *future.result())
+                logging.warning("COMPLETED downloading url %s to %s", *future.result())
 
 
 def main():
-    logging.basicConfig(level=logging.INFO,
-                        format="%(asctime)-15s %(levelname)-8s %(filename)s:%(lineno)d %(message)s")
     parser = argparse.ArgumentParser(description="Open directory scraper")
     parser.add_argument('-u', '--url', help="url to scrape")
    parser.add_argument('-o', '--output-dir', help="destination for downloaded files")
     parser.add_argument('-p', '--parallel', type=int, default=5, help="number of downloads to execute in parallel")
     parser.add_argument('-c', '--clobber', action="store_true", help="clobber existing files instead of resuming")
+    parser.add_argument('-d', '--delay', type=int, default=0, help="delay between requests")
+    parser.add_argument('-v', '--verbose', action="store_true", help="enable info logging")
     args = parser.parse_args()
-    logging.info("cli args: %s", args)
+    logging.basicConfig(level=logging.INFO if args.verbose else logging.WARNING,
+                        format="%(asctime)-15s %(levelname)-8s %(filename)s:%(lineno)d %(message)s")
+    logging.debug("cli args: %s", args)
 
     with ThreadPoolExecutor(max_workers=args.parallel) as executor:
         with closing(asyncio.get_event_loop()) as loop:
@@ -162,7 +171,8 @@ def main():
                                   base_url,
                                   [], # visited urls
                                   [], # futures
-                                  asyncio.Semaphore(value=args.parallel))
+                                  asyncio.Semaphore(value=args.parallel),
+                                  args.delay)
             downloader = asyncio.ensure_future(scrape_url(base_url, config), loop=loop)
             try:
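
For reference, the resume-and-throttle pattern this diff touches can be sketched standalone. This is a minimal sketch, assuming a server that honors Range headers and reports Content-Length; fetch_resumable, its signature, and the append-mode handling are illustrative only and not part of this codebase (which routes downloads through stream_url and a semaphore instead):

    import os
    from time import sleep

    import requests


    def fetch_resumable(url, local_path, delay=0):
        # Initial streaming request: headers arrive now, the body is pulled later.
        response = requests.get(url, stream=True)
        response.raise_for_status()
        mode = "wb"
        if os.path.exists(local_path):
            fsize = os.path.getsize(local_path)
            remote_size = int(response.headers.get("Content-length", 0))
            if fsize == remote_size:
                return local_path  # already fully downloaded, nothing to do
            if delay:
                sleep(delay)  # throttle before hitting the server again
            response.close()  # discard the first connection; we re-request below
            # Re-request only the missing byte range and append to the partial file.
            response = requests.get(
                url, stream=True,
                headers={"Range": "bytes={}-{}".format(fsize, remote_size)})
            response.raise_for_status()
            mode = "ab"
        with open(local_path, mode) as f:
            for chunk in response.iter_content(chunk_size=256 * 1024):
                f.write(chunk)
        return local_path

One design note on the sketch: opening the partial file in append mode ("ab") is what makes the ranged re-request safe, since writing "wb" after a ranged response would truncate the bytes already on disk.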