scripts/python/web/sheethost_scraper.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
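"""Scraper for PDF sheet music hosted on sheet.host.

Logs in through hi10anime.com's WordPress endpoint, then walks the given
user's sheet listing and downloads every linked PDF.
"""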
import sys

import bs4
import requests

if len(sys.argv) < 4:
    print('Usage: %s [login] [password] [page name]' % sys.argv[0])
    sys.exit(1)

login = sys.argv[1]
password = sys.argv[2]
page_name = sys.argv[3]
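
# The scrape is two-level: the user's /sheets listing on sheet.host links
# to individual sheet pages, and each sheet page links to the PDFs themselves.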


def download_sheet(s, url):
    """Fetch one sheet page and save every PDF it links to."""
    page = s.get(url)
    try:
        page.raise_for_status()
    except requests.HTTPError:
        print("Couldn't get %s" % url)
        return
    soup = bs4.BeautifulSoup(page.text, 'html.parser')
    for link in soup.select('a'):
        if '.pdf' not in link.text:
            continue
        # Download the PDF first so a failed request doesn't leave behind
        # an empty file, and check the PDF response itself for errors.
        file = s.get(link.attrs['href'])
        try:
            file.raise_for_status()
        except requests.HTTPError:
            print("Couldn't get %s" % link.text)
            return
        # Filename: drop the leading character of the link text and keep
        # everything up to and including the '.pdf' suffix.
        with open(link.text[1:link.text.find('.pdf') + 4], 'wb') as f:
            for chunk in file.iter_content(100000):
                f.write(chunk)


with requests.Session() as s:
    # A stock wp-login.php expects 'log' and 'pwd' as the form field names;
    # this POST assumes the site also accepts 'login' and 'password'.
    resp = s.post('https://hi10anime.com/wp-login.php',
                  {'login': login, 'password': password})
    resp.raise_for_status()
    if 'You have successfully logged in. Welcome back!' not in resp.text:
        print("Couldn't log in")
        sys.exit(1)
    page = s.get('https://sheet.host/user/%s/sheets' % page_name)
    page.raise_for_status()
    soup = bs4.BeautifulSoup(page.text, 'html.parser')
    # Each '.score-title' element is a link to an individual sheet page.
    titles = soup.select('.score-title')
    for title in titles:
        print('Getting %s' % title.text)
        download_sheet(s, title.attrs['href'])
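
# Example invocation (hypothetical credentials and sheet.host user name):
#   ./sheethost_scraper.py alice s3cret some_user
# Downloaded PDFs are written to the current working directory.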