#!/usr/bin/env python3
# -*- coding: utf-8 -*-

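# Scrape the subs.com.ru listing pages for English anime subtitles
# ("enganime") and download every .rar/.zip/.7z archive they link to.
# The CSS selectors below are tied to that site's markup and may need
# updating if the page layout changes.
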
import requests
import bs4


def download_file(url):
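    """Save the file at *url* into the current directory.

    On an HTTP error, write an empty ``<filename>.failed`` marker
    instead and skip the download.
    """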
    filename = url[url.rfind('/') + 1:]

    print('Downloading %s' % filename)

    response = requests.get(url, stream=True)

    try:
        response.raise_for_status()
    except requests.exceptions.HTTPError:
        # Leave an empty marker file so failed downloads are easy to
        # spot, and skip writing the real file.
        open(filename + '.failed', 'w').close()
        return

    # The with block closes the file for us; no explicit close() needed.
    with open(filename, 'wb') as f:
        for chunk in response.iter_content(100000):
            f.write(chunk)


def get_file_name(url):
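    """Return the archive name (.rar/.zip/.7z) listed on a subtitle
    detail page, or None if no archive cell is found."""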
    page = requests.get(url)
    page.raise_for_status()

    soup = bs4.BeautifulSoup(page.text, "html.parser")

    # The archive name sits in one of the table cells of the detail page.
    cells = soup.select('td.even')
    for cell in cells:
        text = cell.getText()

        if '.rar' in text or '.zip' in text or '.7z' in text:
            return text


def scrape_site(url):
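    """Walk the paginated listing starting at *url* and download every
    subtitle archive linked from it."""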
    # Split the url into the site base and the page component so we can
    # rebuild absolute urls for the links found on each page.
    base_url = url[:url.rfind('/') + 1]
    url = url[url.rfind('/') + 1:]

    while True:
        print('Getting %s' % url)

        page = requests.get(base_url + url)
        page.raise_for_status()  # throw on a failed request

        soup = bs4.BeautifulSoup(page.text, "html.parser")

        titles = soup.select('a[title]')
        for title in titles:
            link = title.attrs['href']

            if 'id' in link and 'dl' not in link:  # find content links
                print('Found %s' % title.attrs['title'])

                filename = get_file_name(base_url + link)
                if filename is not None:
                    download_file(base_url + 'sub/enganime/' + filename)

        # Follow the "next page" link until there is none left.
        next_link = soup.select('span.pagenav_next > a')
        if len(next_link) == 0:
            print('End of site')
            break

        url = next_link[0].attrs['href']


scrape_site('http://subs.com.ru/list.php?c=enganime&p=5&w=asc&d=1')