Add excludes option

dave 2019-03-06 19:30:14 -08:00
parent 2b2dcb50a3
commit b28671ce89
3 changed files with 31 additions and 10 deletions

View File

@@ -4,7 +4,7 @@ import os
 import argparse
 import asyncio
 import logging
-import traceback
+from fnmatch import fnmatch
 from time import sleep
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin, unquote, urlsplit, urlunsplit
@@ -14,7 +14,7 @@ from concurrent.futures import ThreadPoolExecutor
 from contextlib import closing
-ScrapeConfig = namedtuple("ScrapeConfig", "output loop executor base_url visited futures semaphore delay")
+ScrapeConfig = namedtuple("ScrapeConfig", "output loop executor base_url visited futures semaphore delay exclude")
 """
 output: base dest dir to put downloaded files
 loop: asyncio loop object
@@ -58,14 +58,20 @@ def get_links(content):
 def stream_to_file(response, url, options):
-    url_suffix = url[len(options.base_url):]
-    local_path = os.path.normpath(os.path.join(options.output, unquote(url_suffix)))
+    url_suffix = unquote(url[len(options.base_url):])
+    local_path = os.path.normpath(os.path.join(options.output, url_suffix))
     if not local_path.startswith(options.output):
         raise Exception("Aborted: directory traversal detected!")
-    os.makedirs(os.path.dirname(local_path), exist_ok=True)
     try:
+        for pattern in options.exclude:
+            if fnmatch(url_suffix, pattern):
+                logging.info("Excluded: '%s' on pattern '%s'", url_suffix, pattern)
+                raise AlreadyDownloadedException("Excluded")
+        os.makedirs(os.path.dirname(local_path), exist_ok=True)
         if os.path.exists(local_path):
             response.close()
             # Local file exists, restart request with range
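
The block added above checks the unquoted URL suffix against each shell-style exclude pattern before any directory is created or any bytes are written, and reuses AlreadyDownloadedException to bail out of the download path. A minimal standalone sketch of just the matching step, with made-up example paths and patterns:

from fnmatch import fnmatch

def is_excluded(url_suffix, patterns):
    # True if the relative path matches any shell-style exclude pattern;
    # note that fnmatch's "*" also matches across "/" separators.
    return any(fnmatch(url_suffix, pattern) for pattern in patterns)

print(is_excluded("backups/2019/db.sql.gz", ["*.iso", "backups/*"]))  # True
print(is_excluded("docs/readme.txt", ["*.iso", "backups/*"]))         # False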
@@ -126,7 +132,7 @@ async def scrape_url(url, options, skip=False):
         else:
             # Actual file, download it
             # await download_file(g, url, options)
-            options.semaphore.acquire()
+            await options.semaphore.acquire()
             options.futures.append((options.executor.submit(stream_to_file, g, url, options), url, ))
     # Purge completed futures
     for item in options.futures[:]:
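
The added await matters because asyncio.Semaphore.acquire() is a coroutine: calling it without await only creates a coroutine object and never actually takes a slot, so the parallel-download limit is silently ignored. A self-contained illustration of the pattern, with an invented worker function standing in for the real download:

import asyncio

async def limited_task(sem, n):
    await sem.acquire()           # without "await" the limit would not be enforced
    try:
        await asyncio.sleep(0.1)  # stand-in for real work
        return n
    finally:
        sem.release()

async def demo():
    sem = asyncio.Semaphore(value=2)  # at most two tasks run concurrently
    print(await asyncio.gather(*(limited_task(sem, i) for i in range(5))))

asyncio.run(demo())

The same limit could also be written as "async with sem:", which releases the slot automatically.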
@@ -150,14 +156,21 @@ def main():
     parser.add_argument('-p', '--parallel', type=int, default=5, help="number of downloads to execute in parallel")
     parser.add_argument('-c', '--clobber', action="store_true", help="clobber existing files instead of resuming")
     parser.add_argument('-d', '--delay', type=int, default=0, help="delay between requests")
+    parser.add_argument('-e', '--exclude', default=[], nargs="+", help="exclude patterns")
+    parser.add_argument('-f', '--exclude-from', help="exclude patterns from file")
     parser.add_argument('-v', '--verbose', action="store_true", help="enable info logging")
     args = parser.parse_args()
-    logging.basicConfig(level=logging.INFO if args.verbose else logging.WARNING,
+    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.WARNING,
                         format="%(asctime)-15s %(levelname)-8s %(filename)s:%(lineno)d %(message)s")
     logging.debug("cli args: %s", args)
+    excludes = list(args.exclude)
+    if args.exclude_from:
+        with open(args.exclude_from) as f:
+            excludes += [l.strip() for l in f.readlines() if l.strip()]
     with ThreadPoolExecutor(max_workers=args.parallel) as executor:
         with closing(asyncio.get_event_loop()) as loop:
             loop.set_debug(True)
@@ -172,7 +185,8 @@ def main():
                                   [],  # visited urls
                                   [],  # futures
                                   asyncio.Semaphore(value=args.parallel),
-                                  args.delay)
+                                  args.delay,
+                                  excludes)
             downloader = asyncio.ensure_future(scrape_url(base_url, config), loop=loop)
             try:
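
main() builds the final exclude list by starting from the -e/--exclude values and appending any non-blank lines read from the -f/--exclude-from file, then passes it to ScrapeConfig as the new last field. A sketch of just that merge step, with hypothetical pattern values:

def load_excludes(cli_patterns, exclude_from=None):
    # Combine patterns given on the command line with ones read from a file.
    patterns = list(cli_patterns)
    if exclude_from:
        with open(exclude_from) as f:
            patterns += [line.strip() for line in f if line.strip()]
    return patterns

excludes = load_excludes(["*.iso", "backups/*"])  # pass exclude_from="excludes.txt" to also read a file

On the command line the equivalent would be something like -e "*.iso" "backups/*" -f excludes.txt; the pattern values and file name are examples only, and the remaining options are unchanged by this commit.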

View File

@@ -1,2 +1,3 @@
 beautifulsoup4==4.5.3
+bs4==0.0.1
 requests==2.13.0

View File

@@ -1,10 +1,15 @@
 #!/usr/bin/env python3
 from setuptools import setup
+with open("requirements.txt") as f:
+    requirements = f.readlines()
 setup(name='pyods',
-      version='0.0.1',
+      version='0.0.2',
       description='Open Directory Scraper',
-      url='http://gitlab.xmopx.net/dave/pyods',
+      url='https://git.davepedu.com/dave/pyods',
       author='dpedu',
       author_email='dave@davepedu.com',
       packages=['pyods'],
@@ -13,4 +18,5 @@ setup(name='pyods',
               'pyods = pyods.cli:main'
           ]
       },
+      install_requires=requirements,
       zip_safe=False)