scripts/python/web/sheethost_scraper.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
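"""Scraper for PDF sheet music hosted on sheet.host.

Logs in through hi10anime.com's WordPress endpoint, then walks the given
user's sheet listing and downloads every linked PDF.
"""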
import sys

import bs4
import requests

if len(sys.argv) < 4:
    print('Usage: %s [login] [password] [page name]' % sys.argv[0])
    sys.exit(1)

login = sys.argv[1]
password = sys.argv[2]
page_name = sys.argv[3]
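
# The scrape is two-level: the user's /sheets listing on sheet.host links
# to individual sheet pages, and each sheet page links to the PDFs themselves.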


def download_sheet(s, url):
    """Fetch one sheet page and save every PDF it links to."""
    page = s.get(url)
    try:
        page.raise_for_status()
    except requests.HTTPError:
        print("Couldn't get %s" % url)
        return
    soup = bs4.BeautifulSoup(page.text, 'html.parser')
    for link in soup.select('a'):
        if '.pdf' not in link.text:
            continue
        # Download the PDF first so a failed request doesn't leave behind
        # an empty file, and check the PDF response itself for errors.
        file = s.get(link.attrs['href'])
        try:
            file.raise_for_status()
        except requests.HTTPError:
            print("Couldn't get %s" % link.text)
            return
        # Filename: drop the leading character of the link text and keep
        # everything up to and including the '.pdf' suffix.
        with open(link.text[1:link.text.find('.pdf') + 4], 'wb') as f:
            for chunk in file.iter_content(100000):
                f.write(chunk)


with requests.Session() as s:
    # A stock wp-login.php expects 'log' and 'pwd' as the form field names;
    # this POST assumes the site also accepts 'login' and 'password'.
    resp = s.post('https://hi10anime.com/wp-login.php',
                  {'login': login, 'password': password})
    resp.raise_for_status()
    if 'You have successfully logged in. Welcome back!' not in resp.text:
        print("Couldn't log in")
        sys.exit(1)
    page = s.get('https://sheet.host/user/%s/sheets' % page_name)
    page.raise_for_status()
    soup = bs4.BeautifulSoup(page.text, 'html.parser')
    # Each '.score-title' element is a link to an individual sheet page.
    titles = soup.select('.score-title')
    for title in titles:
        print('Getting %s' % title.text)
        download_sheet(s, title.attrs['href'])
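
# Example invocation (hypothetical credentials and sheet.host user name):
#   ./sheethost_scraper.py alice s3cret some_user
# Downloaded PDFs are written to the current working directory.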