Initial demo

dave 2017-04-25 22:01:59 -07:00
commit 006dad5ace
2 changed files with 151 additions and 0 deletions

pyods/cli.py

@@ -0,0 +1,135 @@
#!/usr/bin/env python3.5
import os
import argparse
import asyncio
import logging
from bs4 import BeautifulSoup
from urllib.parse import urljoin, unquote, urlsplit, urlunsplit
from collections import namedtuple
from requests import Session
from concurrent.futures import ThreadPoolExecutor
from contextlib import closing


ScrapeConfig = namedtuple("ScrapeConfig", "output loop executor base_url visited semaphore")
"""
output: base dest dir to put downloaded files
loop: asyncio loop object
executor: threadpool executor used for large file downloads
base_url: remote open dir base url to scan
visited: list of urls already visiting/visited
semaphore: caps how many downloads run in the executor at once
"""
http = Session()


def clean_url(url):
    """
    Run a url through urlsplit and urlunsplit, ensuring uniform url format
    """
    return urlunsplit(urlsplit(url))


def stream_url(url):
    """
    Return a streaming requests Response object for the given URL
    """
    return http.get(url, stream=True)


def get_links(content):
    """
    Parse and yield the href of every link on a given page

    :param content: html body to scan
    """
    doc = BeautifulSoup(content, "html.parser")
    for link in doc.find_all("a"):
        href = link.attrs.get('href', None)
        if href:
            yield href
def stream_to_file(response, url, semaphore):
    print("STREAMING {}".format(url))
    # Stream the response body to disk in 256 KiB chunks; in this initial demo
    # the data is discarded to /dev/null instead of being written under the
    # configured output dir
    try:
        with open("/dev/null", "wb") as f:
            for chunk in response.iter_content(chunk_size=256 * 1024):
                f.write(chunk)
    finally:
        semaphore.release()
        response.close()
async def scrape_url(url, options):
    """
    - Request the URL
    - If HTML, parse it and recurse into its links
    - If anything else, download it
    """
    options.visited.append(url)

    g = await options.loop.run_in_executor(None, stream_url, url)
    if g.status_code != 200:
        logging.warning("Fetch failed, code was %s", g.status_code)
        return

    content_type = g.headers.get("Content-Type", "")
    if "text/html" in content_type:
        # HTML page, parse it and follow its links
        for link in get_links(g.text):
            link_dest = clean_url(urljoin(url, link))
            if not link_dest.startswith(options.base_url):
                # Link leads outside the base_url, skip it
                continue
            if link_dest not in options.visited:
                # Link is valid and not seen before, visit it
                logging.info("Visiting %s", link_dest)
                await scrape_url(link_dest, options)
        g.close()
    else:
        # Actual file: hand the open response off to the threadpool, using the
        # semaphore to cap how many downloads stream to disk at once
        # await download_file(g, url, options)
        await options.semaphore.acquire()
        options.executor.submit(stream_to_file, g, url, options.semaphore)
def main():
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)-15s %(levelname)-8s %(filename)s:%(lineno)d %(message)s")

    parser = argparse.ArgumentParser(description="Open directory scraper")
    parser.add_argument('-u', '--url', help="url to scrape")
    parser.add_argument('-o', '--output-dir', help="destination for downloaded files")
    parser.add_argument('-p', '--parallel', type=int, default=5, help="number of downloads to execute in parallel")
    args = parser.parse_args()
    logging.info("cli args: %s", args)

    with ThreadPoolExecutor(max_workers=args.parallel) as executor:
        with closing(asyncio.get_event_loop()) as loop:
            loop.set_debug(True)
            loop.set_default_executor(executor)

            base_url = clean_url(args.url)
            config = ScrapeConfig(os.path.normpath(args.output_dir),
                                  loop,
                                  executor,
                                  base_url,
                                  [],
                                  asyncio.Semaphore(value=args.parallel))

            downloader = asyncio.ensure_future(scrape_url(base_url, config), loop=loop)
            try:
                loop.run_until_complete(downloader)
            finally:
                logging.debug("Escaped main loop")


if __name__ == '__main__':
    main()
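
As a quick illustration of how the parsing helpers above feed the crawler, here is a minimal sketch; the HTML snippet and URLs are made up, and it assumes the installed package is importable as pyods.cli:

from urllib.parse import urljoin
from pyods.cli import clean_url, get_links

# A made-up open-directory index page
html = '<a href="movie.mkv">movie.mkv</a><a href="sub/">sub/</a>'
base = "http://example.com/files/"  # placeholder base_url

for href in get_links(html):
    # scrape_url() applies the same join/normalise step before recursing
    print(clean_url(urljoin(base, href)))
# prints http://example.com/files/movie.mkv and http://example.com/files/sub/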

setup.py

@@ -0,0 +1,16 @@
#!/usr/bin/env python3
from setuptools import setup

setup(name='pyods',
      version='0.0.1',
      description='Open Directory Scraper',
      url='http://gitlab.xmopx.net/dave/pyods',
      author='dpedu',
      author_email='dave@davepedu.com',
      packages=['pyods'],
      entry_points={
          'console_scripts': [
              'pyods = pyods.cli:main'
          ]
      },
      zip_safe=False)
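
Once installed, the console_scripts entry point above should expose a pyods command; a rough usage sketch (the URL and output path are placeholders, not taken from this commit):

pip install -e .
pyods -u http://example.com/files/ -o /tmp/pyods-out -p 5

The -p value sets both the thread pool size and the download semaphore, so it also caps how many files stream to disk in parallel.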