olari
2019-04-13 22:28:43 +03:00
commit cbf6cd3f07
19 changed files with 1066 additions and 0 deletions

python/web/gogdb_scaper.py Executable file

@@ -0,0 +1,23 @@
#!/usr/bin/env python3
import requests  # http requests
import bs4  # html parser

with open("titles.txt", "w", encoding="UTF-8") as file:
    for index in range(1, 175):
        url = "https://www.gogdb.org/products?page=" + str(index)
        print(url)
        page = requests.get(url)
        page.raise_for_status()
        soup = bs4.BeautifulSoup(page.text, "html.parser")
        producttable = soup.select("#product-table")[0]
        titles = producttable.select("tr")
        for title in titles:
            # skip rows without a type column (e.g. the header row)
            if len(title.select(".col-type")) == 0:
                continue
            # keep only rows whose type column says 'Game'
            if title.select(".col-type")[0].text == 'Game':
                file.write(title.select(".col-name")[0].text.strip() + '\n')
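
# The selectors above assume the gogdb.org product list is laid out roughly like this
# (a sketch, not the exact markup):
#
#   <table id="product-table">
#     <tr><td class="col-name">Some Game</td> ... <td class="col-type">Game</td></tr>
#     ...
#   </table>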

python/web/mal_top_fetcher.py Executable file

@@ -0,0 +1,34 @@
#!/usr/bin/env python3
import requests, bs4

def get_titles(filename, title_type, maxrank):
    with open(filename, "w", encoding="UTF-8") as file:
        limit = 0
        written = 0
        while True:
            page = requests.get("https://myanimelist.net/topanime.php?type=" + title_type + "&limit=" + str(limit),
                                headers={'User-agent': 'stopblockingmyscriptlol'})
            page.raise_for_status()
            soup = bs4.BeautifulSoup(page.text, "html.parser")
            titles = soup.select("a[rel]")
            for title in titles:
                # skip stray links that also match the selector (two-character texts and the login link)
                if len(title.text) == 2 or title.text == "Login":
                    continue
                file.write(title.text.strip() + '\n')
                written += 1
                print(str(written), title.text.strip())
                if written >= maxrank:
                    break
            if written >= maxrank:
                break
            limit += 50
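
# Note on the pagination: topanime.php appears to treat 'limit' as an offset into the
# ranking (limit=0 -> ranks 1-50, limit=50 -> ranks 51-100, ...), which is why the loop
# above advances it in steps of 50.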
get_titles("rating.txt", "tv", 1750)
get_titles("movies.txt", "movie", 300)

python/web/pwned_checker.py Executable file

@@ -0,0 +1,137 @@
#!/usr/bin/env python3
# reads SteamIDs from ./accounts.txt and outputs ban information into ./output.html
import urllib.request
import json
import time

steamapikey = ""  # put your Steam Web API key here

# read file and remove trailing newlines because we're making a list
account_lines = [line.rstrip("\n") for line in open("accounts.txt").readlines()]

ids = []
for line in account_lines:
    # convert a textual SteamID (STEAM_X:Y:Z) into a 64-bit profile id,
    # see https://developer.valvesoftware.com/wiki/SteamID
    Z = int(line.split(':')[2])
    V = 0x0110000100000000  # profile ID constant
    Y = int(line.split(':')[1])
    W = Z * 2 + V + Y
    ids.append(str(W))
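
# Worked example with a made-up account STEAM_0:1:12345:
#   Y = 1, Z = 12345
#   W = 12345 * 2 + 0x0110000100000000 + 1 = 76561197960290419
# which is the 64-bit id the Web API endpoints below expect.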

# the API takes comma-separated 64-bit steamids
ids_string = ",".join(ids)

# https://developer.valvesoftware.com/wiki/Steam_Web_API
summaries = json.load(urllib.request.urlopen("http://api.steampowered.com/ISteamUser/GetPlayerSummaries/v0002/?key=" + steamapikey + "&steamids=" + ids_string))
bans = json.load(urllib.request.urlopen("http://api.steampowered.com/ISteamUser/GetPlayerBans/v1/?key=" + steamapikey + "&steamids=" + ids_string))

output_file = open("output.html", "w", encoding="utf-8")
output_file.write('''\
<!DOCTYPE html>
<html>
<head>
<style>
body {
    font-family: sans-serif;
}

table {
    color: #222;
    border-collapse: collapse;
}

tr, th, td {
    border: 1px solid #a2a9b1;
    padding: 0.2em 0.4em;
}

.pwned {
    background-color: #ffb6c1
}

th {
    background-color: #eaecf0;
    text-align: center;
}

a:hover, a:visited, a:link, a:active {
    text-decoration: none;
}
</style>
</head>

<body>
<table>
    <tr>
        <th>ID</th>
        <th>Name</th>
        <th>Status</th>
        <th>Type</th>
        <th>BanDays</th>
        <th>LogDays</th>
        <th>Profile</th>
    </tr>
''')

numbanned = 0
for i in range(len(ids)):
    # find the summary that belongs to this id
    try:
        for summary in summaries['response']['players']:
            if summary['steamid'] == str(ids[i]):
                break
        else:
            continue  # no summary returned for this id
    except:
        continue
    # find the matching ban record
    try:
        for ban in bans['players']:
            if ban['SteamId'] == str(ids[i]):
                break
        else:
            continue  # no ban record returned for this id
    except:
        continue
    status = ""
    bantype = ""
    bandays = ""
    if ban['VACBanned']:
        status = "Pwned"
        bantype = "VAC"
        bandays = str(ban['DaysSinceLastBan'])
        numbanned += 1
    if ban['NumberOfGameBans'] > 0:
        status = "Pwned"
        bantype = "Gameban"
        bandays = str(ban['DaysSinceLastBan'])
        if not ban['VACBanned']:
            numbanned += 1  # don't double-count accounts that have both ban types
    # escape characters that would break the generated html
    name = summary['personaname']
    name = name.replace("&", "&amp;")
    name = name.replace("<", "&lt;")
    name = name.replace(">", "&gt;")
    logdays = str(int((time.time() - summary['lastlogoff']) / 86400))  # 86400 seconds in a day
    line_start = '        <td>' if status != "Pwned" else '        <td class="pwned">'
    output_file.write('    <tr>\n')
    output_file.write(line_start + '<a target="_blank" href="https://steamcommunity.com/profiles/' + str(ids[i]) + '">' + str(ids[i]) + '</a></td>\n')
    output_file.write(line_start + name + '</td>\n')
    output_file.write(line_start + status + '</td>\n')
    output_file.write(line_start + bantype + '</td>\n')
    output_file.write(line_start + bandays + '</td>\n')
    output_file.write(line_start + logdays + '</td>\n')
    output_file.write(line_start + '<a target="_blank" href="https://steamcommunity.com/profiles/' + str(ids[i]) + '"><img src="' + summary['avatarmedium'] + '"></a></td>\n')
    output_file.write('    </tr>\n')

output_file.write('</table>\n')
output_file.write(str(numbanned) + '/' + str(len(ids)) + ' banned\n')
output_file.write('</body>\n</html>\n')
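
# Sketch of an alternative for the manual escaping above: the standard library's
# html.escape covers '&', '<', '>' and quotes in one call:
#   import html
#   name = html.escape(summary['personaname'])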

python/web/sheethost_scraper.py Executable file

@@ -0,0 +1,58 @@
#!/usr/bin/env python3
import requests
import bs4
import sys

if len(sys.argv) < 4:
    print('Usage: ' + sys.argv[0] + ' [login] [password] [page name]')
    sys.exit(1)

login = sys.argv[1]
password = sys.argv[2]
page_name = sys.argv[3]

def download_sheet(s, url):
    page = s.get(url)
    try:
        page.raise_for_status()
    except:
        print("Couldn't get %s" % url)
        return
    soup = bs4.BeautifulSoup(page.text, 'html.parser')
    links = soup.select('a')
    for link in links:
        if '.pdf' in link.text:
            # file name: drop the leading character and keep everything up to and including '.pdf'
            with open(link.text[1:link.text.find('.pdf') + 4], 'wb') as f:
                file = s.get(link.attrs['href'])
                try:
                    file.raise_for_status()
                except:
                    print("Couldn't get %s" % link.text)
                    return
                for chunk in file.iter_content(100000):
                    f.write(chunk)
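
# For instance (made-up link text), download_sheet saves a link labelled
# ' Moonlight Sonata.pdf (arr.)' to the local file 'Moonlight Sonata.pdf':
# the slice above drops the leading character and cuts everything after '.pdf'.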

with requests.session() as s:
    response = s.post('https://hi10anime.com/wp-login.php', {'login': login, 'password': password})
    response.raise_for_status()
    if 'You have successfully logged in. Welcome back!' not in response.text:
        print("Couldn't log in")
        sys.exit(1)
    page = s.get('https://sheet.host/user/%s/sheets' % page_name)
    page.raise_for_status()
    soup = bs4.BeautifulSoup(page.text, 'html.parser')
    titles = soup.select('.score-title')
    for title in titles:
        print('Getting %s' % title.text)
        download_sheet(s, title.attrs['href'])

python/web/subscomru_scraper.py Executable file

@@ -0,0 +1,66 @@
#!/usr/bin/env python3
import requests
import bs4

def download_file(url):
    filename = url[url.rfind('/') + 1:]
    print('Downloading %s' % filename)
    file = requests.get(url)
    try:
        file.raise_for_status()
    except:
        # leave an empty marker file so failed downloads are easy to spot and retry
        open(filename + '.failed', 'w').close()
        return
    with open(filename, 'wb') as f:
        for chunk in file.iter_content(100000):
            f.write(chunk)

def get_file_name(url):
    page = requests.get(url)
    page.raise_for_status()
    soup = bs4.BeautifulSoup(page.text, "html.parser")
    cells = soup.select('td.even')  # the archive name is buried in one of these table cells
    for cell in cells:
        text = cell.getText()
        if '.rar' in text or '.zip' in text or '.7z' in text:
            return text

def scrape_site(url):
    # split the url so new page urls can be constructed later
    base_url = url[:url.rfind('/') + 1]
    url = url[url.rfind('/') + 1:]
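    # For the url passed in at the bottom of this script the split gives
    #   base_url -> 'http://subs.com.ru/'
    #   url      -> 'list.php?c=enganime&p=5&w=asc&d=1'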
    while True:
        print('Getting %s' % url)
        page = requests.get(base_url + url)
        page.raise_for_status()  # raise on failure
        soup = bs4.BeautifulSoup(page.text, "html.parser")
        titles = soup.select('a[title]')
        for title in titles:
            link = title.attrs['href']
            if 'id' in link and not 'dl' in link:  # find content links
                print('Found %s' % title.attrs['title'])
                download_file(base_url + 'sub/enganime/' + get_file_name(base_url + link))
        next_link = soup.select('span.pagenav_next > a')
        if len(next_link) == 0:
            print('End of site')
            break
        url = next_link[0].attrs['href']

scrape_site('http://subs.com.ru/list.php?c=enganime&p=5&w=asc&d=1')