scripts/python/web/subscomru_scraper.py

#!/usr/bin/env python3
import requests
import bs4
def download_file(url):
    # Take everything after the last '/' as the local file name
    filename = url[url.rfind('/') + 1:]
    print('Downloading %s' % filename)
    response = requests.get(url)
    try:
        response.raise_for_status()
    except requests.exceptions.HTTPError:
        # Leave an empty marker file so failed downloads are easy to spot, then skip
        open(filename + '.failed', 'w').close()
        return
    with open(filename, 'wb') as f:
        for chunk in response.iter_content(100000):
            f.write(chunk)
def get_file_name(url):
    # Open the subtitle's detail page and pull the archive file name out of the table
    page = requests.get(url)
    page.raise_for_status()
    soup = bs4.BeautifulSoup(page.text, "html.parser")
    cells = soup.select('td.even')
    for cell in cells:
        text = cell.getText()
        if '.rar' in text or '.zip' in text or '.7z' in text:
            return text
    return None  # no archive name found on this page
def scrape_site(url):
    # Split the URL so the base can be reused when constructing links to other pages
    base_url = url[:url.rfind('/') + 1]
    url = url[url.rfind('/') + 1:]
    while True:
        print('Getting %s' % url)
        page = requests.get(base_url + url)
        page.raise_for_status()  # throw on failure
        soup = bs4.BeautifulSoup(page.text, "html.parser")
        titles = soup.select('a[title]')
        for title in titles:
            link = title.attrs['href']
            if 'id' in link and 'dl' not in link:  # links to subtitle detail pages
                print('Found %s' % title.attrs['title'])
                filename = get_file_name(base_url + link)
                if filename is not None:
                    download_file(base_url + 'sub/enganime/' + filename)
        # Follow the "next page" link until there is none left
        next_link = soup.select('span.pagenav_next > a')
        if len(next_link) == 0:
            print('End of site')
            break
        url = next_link[0].attrs['href']
scrape_site('http://subs.com.ru/list.php?c=enganime&p=5&w=asc&d=1')
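
As a sketch of one way to make the starting page configurable, the hard-coded call above could be swapped for a small command-line entry point. The sys.argv handling below is an assumption for illustration, not part of the original script; it keeps the original enganime listing as the default.

# Sketch only: this would replace the scrape_site(...) call above, not run alongside it.
import sys

if __name__ == '__main__':
    # Default to the enganime listing used in the original script.
    default_url = 'http://subs.com.ru/list.php?c=enganime&p=5&w=asc&d=1'
    scrape_site(sys.argv[1] if len(sys.argv) > 1 else default_url)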