#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Scrape subtitle archives (.rar/.zip/.7z) from subs.com.ru and download them."""

import requests
import bs4


def download_file(url):
    """Download the file at `url` into the current directory."""
    filename = url[url.rfind('/') + 1:]
    print('Downloading %s' % filename)
    response = requests.get(url)
    try:
        response.raise_for_status()
    except requests.HTTPError:
        # Mark failed downloads with an empty placeholder file and skip them.
        open(filename + '.failed', 'w').close()
        return
    with open(filename, 'wb') as f:
        for chunk in response.iter_content(100000):
            f.write(chunk)


def get_file_name(url):
    """Return the archive file name listed on a subtitle detail page, if any."""
    page = requests.get(url)
    page.raise_for_status()
    soup = bs4.BeautifulSoup(page.text, "html.parser")
    cells = soup.select('td.even')
    # The archive name is in one of the table cells; look for a known extension.
    for cell in cells:
        text = cell.getText()
        if '.rar' in text or '.zip' in text or '.7z' in text:
            return text
    return None


def scrape_site(url):
    """Walk the paginated listing, following 'next page' links until the end."""
    # Split the URL to use later for constructing new URLs.
    base_url = url[:url.rfind('/') + 1]
    url = url[url.rfind('/') + 1:]
    while True:
        print('Getting %s' % url)
        page = requests.get(base_url + url)
        page.raise_for_status()  # throw on failure
        soup = bs4.BeautifulSoup(page.text, "html.parser")
        titles = soup.select('a[title]')
        for title in titles:
            link = title.attrs['href']
            if 'id' in link and 'dl' not in link:  # find content links
                print('Found %s' % title.attrs['title'])
                file_name = get_file_name(base_url + link)
                if file_name:
                    download_file(base_url + 'sub/enganime/' + file_name)
        next_link = soup.select('span.pagenav_next > a')
        if len(next_link) == 0:
            print('End of site')
            break
        url = next_link[0].attrs['href']


scrape_site('http://subs.com.ru/list.php?c=enganime&p=5&w=asc&d=1')