Having problems with Python proxy scraper


Greendragon

Hi, I found this Python proxy scraper online, but I am having problems running it. Is anyone able to get it to work?


Code:
import urllib
from bs4 import BeautifulSoup
import cookielib
import base64
import time
import datetime
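# NOTE: this script targets Python 2 (print statements, urllib.urlopen, cookielib).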




numbers = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]


def proxylist():
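	# Scrape proxy-list.org result pages 1-10 and keep every <li> whose text
	# looks like an ip:port entry (contains a dot and is not a "bit" link).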
	proxylistUrls = ["http://proxy-list.org/english/index.php?p=1",
					 "http://proxy-list.org/english/index.php?p=2",
					 "http://proxy-list.org/english/index.php?p=3",
					 "http://proxy-list.org/english/index.php?p=4",
					 "http://proxy-list.org/english/index.php?p=5",
					 "http://proxy-list.org/english/index.php?p=6",
					 "http://proxy-list.org/english/index.php?p=7",
					 "http://proxy-list.org/english/index.php?p=8",
					 "http://proxy-list.org/english/index.php?p=9",
					 "http://proxy-list.org/english/index.php?p=10"]


	proxies = []
	for link in proxylistUrls:
		print "Grabbing: " + link
		response = urllib.urlopen(link)
		html = response.read()


		soup = BeautifulSoup(html, "html.parser")
		for tag in soup.find_all('li'):
			temp = tag.find(text=True)
			if temp and "." in temp and "bit" not in temp:
				proxies.append(temp)


	return proxies


def usproxy():
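	# Scrape the us-proxy.org table: keep every <td> whose text contains a digit
	# (skipping the "... ago" column), then pair consecutive entries as ip:port.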
	templs = []
	url = "http://www.us-proxy.org/"
	print "Grabbing: " + url
	response = urllib.urlopen(url)
	html = response.read()


	soup = BeautifulSoup(html, "html.parser")
	for tag in soup.find_all('td'):
		temp = tag.find(text=True)
		for num in numbers:
			if temp and num in temp and "ago" not in temp:
				templs.append(temp)
				break


	proxies = []


	x = 0


	while x + 1 < len(templs):
		proxies.append(templs[x] + ":" + templs[x + 1])
		x += 2


	return proxies


def freeproxylist():
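	# Same approach as usproxy(), but against the free-proxy-list.net table.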
	url = "http://free-proxy-list.net/"
	print "Grabbing: " + url
	response = urllib.urlopen(url)
	html = response.read()


	templs = []


	soup = BeautifulSoup(html, "html.parser")
	for tag in soup.find_all('td'):
		temp = tag.find(text=True)
		for num in numbers:
			if temp and num in temp and "ago" not in temp:
				templs.append(temp)
				break


	proxies = []


	x = 0
	while x + 1 < len(templs):
		proxies.append(templs[x] + ":" + templs[x + 1])
		x += 2


	return proxies


def coolproxy():
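	# Scrape cool-proxy.net pages 1-9. The IP column is obfuscated with
	# document.write(Base64.decode(str_rot13("..."))), so each entry is unwrapped
	# and decoded with rot13 and then base64 before being paired with its port.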
	coolProxyUrls = ["http://www.cool-proxy.net/proxies/http_proxy_list/sort:score/direction:desc/page:1",
					 "http://www.cool-proxy.net/proxies/http_proxy_list/sort:score/direction:desc/page:2",
					 "http://www.cool-proxy.net/proxies/http_proxy_list/sort:score/direction:desc/page:3",
					 "http://www.cool-proxy.net/proxies/http_proxy_list/sort:score/direction:desc/page:4",
					 "http://www.cool-proxy.net/proxies/http_proxy_list/sort:score/direction:desc/page:5",
					 "http://www.cool-proxy.net/proxies/http_proxy_list/sort:score/direction:desc/page:6",
					 "http://www.cool-proxy.net/proxies/http_proxy_list/sort:score/direction:desc/page:7",
					 "http://www.cool-proxy.net/proxies/http_proxy_list/sort:score/direction:desc/page:8",
					 "http://www.cool-proxy.net/proxies/http_proxy_list/sort:score/direction:desc/page:9",
				


	proxies = []
	for url in coolProxyUrls:
		print "Grabbing: " + url
		response = urllib.urlopen(url)
		html = response.read()


		templs = []


		soup = BeautifulSoup(html, "html.parser")
		for tag in soup.find_all('td'):
			temp = tag.find(text=True)
			for num in numbers:
				if temp is not None and num in temp and ":" not in temp:
					templs.append(temp)
					break




		i = 0
		while i + 1 < len(templs):
			tmp = templs[i].replace('document.write(Base64.decode(str_rot13("', '')
			tmp = tmp.replace('")))', '')
			tmp = tmp.decode('rot13').decode('base64')
			proxies.append(tmp + ":" + templs[i + 1])
			i += 5


	return proxies




proxies = []
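# Pull proxies from each of the four sources.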
proxies += proxylist()
proxies += usproxy()
proxies += freeproxylist()
proxies += coolproxy()


print "Proxies grabbed: " + str(len(proxies))


ts = time.time()
dt = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d-%H-%M-%S')
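# Timestamp used to build a unique output filename.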


fout = open("proxylist-" + dt + ".txt", "w")
for line in proxies:
	fout.write(line + '\n')
fout.close()


print "Saved to proxylist-" + dt + ".txt"


print "Done"
 