Greendragon
Active Member
Hi, I found this Python proxy scraper online, but I am having problems running it. Is anyone able to get it to work?
Code:
import base64
import cookielib
import datetime
import time
import urllib

from bs4 import BeautifulSoup
# The ten ASCII digit characters, used to spot numeric table cells.
numbers = list("0123456789")
def proxylist():
    """Scrape proxies from proxy-list.org, pages 1 through 10.

    Returns:
        list of str: the text of every <li> entry that contains a dot
        (an IP-looking string) and is not a site chrome item ("...bit").
    """
    base = "http://proxy-list.org/english/index.php?p=%d"
    proxies = []
    for page in range(1, 11):
        url = base % page
        print("Grabbing: " + url)
        html = urllib.urlopen(url).read()
        soup = BeautifulSoup(html)
        for item in soup.find_all('li'):
            text = item.find(text=True)
            # find(text=True) can return None for empty elements; guard
            # before the "in" tests (the other scrapers here do the same).
            if text is not None and "." in text and "bit" not in text:
                proxies.append(text)
    return proxies
def usproxy():
    """Scrape proxies from us-proxy.org.

    The proxy table's <td> cells alternate IP, port (plus other columns
    filtered out below), so matching cells are paired into "ip:port".

    Returns:
        list of str: "ip:port" entries.
    """
    url = "http://www.us-proxy.org/"
    print("Grabbing: " + url)
    html = urllib.urlopen(url).read()
    soup = BeautifulSoup(html)
    templs = []
    for cell in soup.find_all('td'):
        text = cell.find(text=True)
        # Keep cells that contain a digit; "ago" excludes the
        # "N minutes ago" last-checked column. Guard None from find().
        if text is not None and "ago" not in text and any(num in text for num in numbers):
            templs.append(text)
    proxies = []
    x = 0
    # Bound on x + 1 so an odd cell count cannot raise IndexError.
    while x + 1 < len(templs):
        proxies.append(templs[x] + ":" + templs[x + 1])
        x += 2
    return proxies
def freeproxylist():
    """Scrape proxies from free-proxy-list.net.

    Same table layout as us-proxy.org: digit-bearing <td> cells alternate
    IP, port and are paired into "ip:port".

    Returns:
        list of str: "ip:port" entries.
    """
    url = "http://free-proxy-list.net/"
    print("Grabbing: " + url)
    html = urllib.urlopen(url).read()
    soup = BeautifulSoup(html)
    templs = []
    for cell in soup.find_all('td'):
        text = cell.find(text=True)
        # Digit cells only; "ago" drops the last-checked column; guard
        # against find() returning None.
        if text is not None and "ago" not in text and any(num in text for num in numbers):
            templs.append(text)
    proxies = []
    x = 0
    # BUG FIX: the original looped while x < len(templs) / 2 with x += 2,
    # which pairs only the first quarter of the cells. Pair them all,
    # with an x + 1 bound so an odd count cannot raise IndexError.
    while x + 1 < len(templs):
        proxies.append(templs[x] + ":" + templs[x + 1])
        x += 2
    return proxies
def coolproxy():
    """Scrape proxies from cool-proxy.net, score-sorted pages 1 through 9.

    The site obfuscates IPs as
    document.write(Base64.decode(str_rot13("..."))); strip the JS wrapper,
    then rot13- and base64-decode to recover the address. Of the digit
    cells harvested per row, index i holds the obfuscated IP and i + 1
    the port; the stride of 5 skips the row's remaining columns.

    NOTE: str.decode('rot13') / .decode('base64') are Python 2-only codecs.

    Returns:
        list of str: "ip:port" entries.
    """
    # BUG FIX: the original hard-coded URL list was missing its closing
    # bracket, which made the whole file a SyntaxError.
    base = ("http://www.cool-proxy.net/proxies/http_proxy_list/"
            "sort:score/direction:desc/page:%d")
    proxies = []
    for page in range(1, 10):
        url = base % page
        print("Grabbing: " + url)
        html = urllib.urlopen(url).read()
        soup = BeautifulSoup(html)
        templs = []
        for cell in soup.find_all('td'):
            text = cell.find(text=True)
            # Digit cells without a colon; guard None from find().
            if text is not None and ":" not in text and any(num in text for num in numbers):
                templs.append(text)
        i = 0
        # Bound on i + 1 so templs[i + 1] cannot raise IndexError.
        while i + 1 < len(templs):
            tmp = templs[i].replace('document.write(Base64.decode(str_rot13("', '')
            tmp = tmp.replace('")))', '')
            tmp = tmp.decode('rot13').decode('base64')
            proxies.append(tmp + ":" + templs[i + 1])
            i += 5
    return proxies
# Gather proxies from every source and save them, one per line, to a
# timestamped file so successive runs do not overwrite each other.
proxies = []
proxies += proxylist()
proxies += usproxy()
proxies += freeproxylist()
proxies += coolproxy()
print("Proxies grabbed: " + str(len(proxies)))

dt = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d-%H-%M-%S')
filename = "proxylist-" + dt + ".txt"
# `with` guarantees the file is closed even if a write fails
# (the original left it open on error).
with open(filename, "w") as fout:
    for line in proxies:
        fout.write(line + '\n')
print("Saved to " + filename)
print("Done")