Add excludes option
This commit is contained in:
parent
2b2dcb50a3
commit
b28671ce89
30
pyods/cli.py
30
pyods/cli.py
|
@@ -4,7 +4,7 @@ import os
|
||||||
import argparse
|
import argparse
|
||||||
import asyncio
|
import asyncio
|
||||||
import logging
|
import logging
|
||||||
import traceback
|
from fnmatch import fnmatch
|
||||||
from time import sleep
|
from time import sleep
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from urllib.parse import urljoin, unquote, urlsplit, urlunsplit
|
from urllib.parse import urljoin, unquote, urlsplit, urlunsplit
|
||||||
|
@@ -14,7 +14,7 @@ from concurrent.futures import ThreadPoolExecutor
|
||||||
from contextlib import closing
|
from contextlib import closing
|
||||||
|
|
||||||
|
|
||||||
ScrapeConfig = namedtuple("ScrapeConfig", "output loop executor base_url visited futures semaphore delay")
|
ScrapeConfig = namedtuple("ScrapeConfig", "output loop executor base_url visited futures semaphore delay exclude")
|
||||||
"""
|
"""
|
||||||
output: base dest dir to put downloaded files
|
output: base dest dir to put downloaded files
|
||||||
loop: asyncio loop object
|
loop: asyncio loop object
|
||||||
|
@@ -58,14 +58,20 @@ def get_links(content):
|
||||||
|
|
||||||
|
|
||||||
def stream_to_file(response, url, options):
|
def stream_to_file(response, url, options):
|
||||||
url_suffix = url[len(options.base_url):]
|
url_suffix = unquote(url[len(options.base_url):])
|
||||||
local_path = os.path.normpath(os.path.join(options.output, unquote(url_suffix)))
|
local_path = os.path.normpath(os.path.join(options.output, url_suffix))
|
||||||
|
|
||||||
if not local_path.startswith(options.output):
|
if not local_path.startswith(options.output):
|
||||||
raise Exception("Aborted: directory traversal detected!")
|
raise Exception("Aborted: directory traversal detected!")
|
||||||
|
|
||||||
os.makedirs(os.path.dirname(local_path), exist_ok=True)
|
|
||||||
try:
|
try:
|
||||||
|
for pattern in options.exclude:
|
||||||
|
if fnmatch(url_suffix, pattern):
|
||||||
|
logging.info("Excluded: '%s' on pattern '%s'", url_suffix, pattern)
|
||||||
|
raise AlreadyDownloadedException("Excluded")
|
||||||
|
|
||||||
|
os.makedirs(os.path.dirname(local_path), exist_ok=True)
|
||||||
|
|
||||||
if os.path.exists(local_path):
|
if os.path.exists(local_path):
|
||||||
response.close()
|
response.close()
|
||||||
# Local file exists, restart request with range
|
# Local file exists, restart request with range
|
||||||
|
@@ -126,7 +132,7 @@ async def scrape_url(url, options, skip=False):
|
||||||
else:
|
else:
|
||||||
# Actual file, download it
|
# Actual file, download it
|
||||||
# await download_file(g, url, options)
|
# await download_file(g, url, options)
|
||||||
options.semaphore.acquire()
|
await options.semaphore.acquire()
|
||||||
options.futures.append((options.executor.submit(stream_to_file, g, url, options), url, ))
|
options.futures.append((options.executor.submit(stream_to_file, g, url, options), url, ))
|
||||||
# Purge completed futures
|
# Purge completed futures
|
||||||
for item in options.futures[:]:
|
for item in options.futures[:]:
|
||||||
|
@@ -150,14 +156,21 @@ def main():
|
||||||
parser.add_argument('-p', '--parallel', type=int, default=5, help="number of downloads to execute in parallel")
|
parser.add_argument('-p', '--parallel', type=int, default=5, help="number of downloads to execute in parallel")
|
||||||
parser.add_argument('-c', '--clobber', action="store_true", help="clobber existing files instead of resuming")
|
parser.add_argument('-c', '--clobber', action="store_true", help="clobber existing files instead of resuming")
|
||||||
parser.add_argument('-d', '--delay', type=int, default=0, help="delay between requests")
|
parser.add_argument('-d', '--delay', type=int, default=0, help="delay between requests")
|
||||||
|
parser.add_argument('-e', '--exclude', default=[], nargs="+", help="exclude patterns")
|
||||||
|
parser.add_argument('-f', '--exclude-from', help="exclude patterns from file")
|
||||||
parser.add_argument('-v', '--verbose', action="store_true", help="enable info logging")
|
parser.add_argument('-v', '--verbose', action="store_true", help="enable info logging")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
logging.basicConfig(level=logging.INFO if args.verbose else logging.WARNING,
|
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.WARNING,
|
||||||
format="%(asctime)-15s %(levelname)-8s %(filename)s:%(lineno)d %(message)s")
|
format="%(asctime)-15s %(levelname)-8s %(filename)s:%(lineno)d %(message)s")
|
||||||
|
|
||||||
logging.debug("cli args: %s", args)
|
logging.debug("cli args: %s", args)
|
||||||
|
|
||||||
|
excludes = list(args.exclude)
|
||||||
|
if args.exclude_from:
|
||||||
|
with open(args.exclude_from) as f:
|
||||||
|
excludes += [l.strip() for l in f.readlines() if l.strip()]
|
||||||
|
|
||||||
with ThreadPoolExecutor(max_workers=args.parallel) as executor:
|
with ThreadPoolExecutor(max_workers=args.parallel) as executor:
|
||||||
with closing(asyncio.get_event_loop()) as loop:
|
with closing(asyncio.get_event_loop()) as loop:
|
||||||
loop.set_debug(True)
|
loop.set_debug(True)
|
||||||
|
@@ -172,7 +185,8 @@ def main():
|
||||||
[], # visited urls
|
[], # visited urls
|
||||||
[], # futures
|
[], # futures
|
||||||
asyncio.Semaphore(value=args.parallel),
|
asyncio.Semaphore(value=args.parallel),
|
||||||
args.delay)
|
args.delay,
|
||||||
|
excludes)
|
||||||
|
|
||||||
downloader = asyncio.ensure_future(scrape_url(base_url, config), loop=loop)
|
downloader = asyncio.ensure_future(scrape_url(base_url, config), loop=loop)
|
||||||
try:
|
try:
|
||||||
|
|
|
@@ -1,2 +1,3 @@
|
||||||
beautifulsoup4==4.5.3
|
beautifulsoup4==4.5.3
|
||||||
|
bs4==0.0.1
|
||||||
requests==2.13.0
|
requests==2.13.0
|
||||||
|
|
10
setup.py
10
setup.py
|
@@ -1,10 +1,15 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
from setuptools import setup
|
from setuptools import setup
|
||||||
|
|
||||||
|
|
||||||
|
with open("requirements.txt") as f:
|
||||||
|
requirements = f.readlines()
|
||||||
|
|
||||||
|
|
||||||
setup(name='pyods',
|
setup(name='pyods',
|
||||||
version='0.0.1',
|
version='0.0.2',
|
||||||
description='Open Directory Scraper',
|
description='Open Directory Scraper',
|
||||||
url='http://gitlab.xmopx.net/dave/pyods',
|
url='https://git.davepedu.com/dave/pyods',
|
||||||
author='dpedu',
|
author='dpedu',
|
||||||
author_email='dave@davepedu.com',
|
author_email='dave@davepedu.com',
|
||||||
packages=['pyods'],
|
packages=['pyods'],
|
||||||
|
@@ -13,4 +18,5 @@ setup(name='pyods',
|
||||||
'pyods = pyods.cli:main'
|
'pyods = pyods.cli:main'
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
install_requires=requirements,
|
||||||
zip_safe=False)
|
zip_safe=False)
|
||||||
|
|
Loading…
Reference in New Issue