scripts/python/web/subscomru_scraper.py

#!/usr/bin/env python3
import requests
import bs4
def download_file(url):
    # Take everything after the last '/' as the local file name
    filename = url[url.rfind('/') + 1:]
    print('Downloading %s' % filename)
    response = requests.get(url)
    try:
        response.raise_for_status()
    except requests.exceptions.HTTPError:
        # Leave an empty marker file so failed downloads are easy to spot, then skip
        open(filename + '.failed', 'w').close()
        return
    with open(filename, 'wb') as f:
        for chunk in response.iter_content(100000):
            f.write(chunk)
def get_file_name(url):
    # Open the subtitle's detail page and pull the archive file name out of the table
    page = requests.get(url)
    page.raise_for_status()
    soup = bs4.BeautifulSoup(page.text, "html.parser")
    cells = soup.select('td.even')
    for cell in cells:
        text = cell.getText()
        if '.rar' in text or '.zip' in text or '.7z' in text:
            return text
    return None  # no archive name found on this page
def scrape_site(url):
    # Split the URL so the base can be reused when constructing links to other pages
    base_url = url[:url.rfind('/') + 1]
    url = url[url.rfind('/') + 1:]
    while True:
        print('Getting %s' % url)
        page = requests.get(base_url + url)
        page.raise_for_status()  # throw on failure
        soup = bs4.BeautifulSoup(page.text, "html.parser")
        titles = soup.select('a[title]')
        for title in titles:
            link = title.attrs['href']
            if 'id' in link and 'dl' not in link:  # links to subtitle detail pages
                print('Found %s' % title.attrs['title'])
                filename = get_file_name(base_url + link)
                if filename is not None:
                    download_file(base_url + 'sub/enganime/' + filename)
        # Follow the "next page" link until there is none left
        next_link = soup.select('span.pagenav_next > a')
        if len(next_link) == 0:
            print('End of site')
            break
        url = next_link[0].attrs['href']
scrape_site('http://subs.com.ru/list.php?c=enganime&p=5&w=asc&d=1')
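
As a sketch of one way to make the starting page configurable, the hard-coded call above could be swapped for a small command-line entry point. The sys.argv handling below is an assumption for illustration, not part of the original script; it keeps the original enganime listing as the default.

# Sketch only: this would replace the scrape_site(...) call above, not run alongside it.
import sys

if __name__ == '__main__':
    # Default to the enganime listing used in the original script.
    default_url = 'http://subs.com.ru/list.php?c=enganime&p=5&w=asc&d=1'
    scrape_site(sys.argv[1] if len(sys.argv) > 1 else default_url)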