I recently wrote a simple web-page crawler as a learning exercise. It is very basic and does not handle the Referer header or use a proxy (a sketch of how those could be added appears after the code). The code is as follows:
This is written as a CGI program; to run it as an ordinary script, just remove the Content-Type header line.
#!/usr/local/python/python
# -*- coding: iso-8859-1 -*-
# Cnangel 2007/11/24: a simple crawler
import sys
import os
import re
import urllib2
print "Content-type:text/html\015\012\015\012"
path = "F:/www/test.huhoo.net/public_html/python"
def gethtml(url):
    # fetch the page and return its content; empty string on failure
    try:
        fp = urllib2.urlopen(url)
    except Exception:
        print "Can't open this url"
        return ''
    content = fp.read()
    fp.close()
    return content
def savefile(filename, content, method='wb'):
    op = open(filename, method)
    op.write(content)
    op.close()
    return 1
def readfile(filename, method='rb'):
    # read a file and return its entire content
    fp = open(filename, method)
    info = fp.read()
    fp.close()
    return info
def findurl(content):
    urlres = re.compile('http://[^\">\s\']+')
    urls = urlres.findall(content)
    return urls
def crawler(firsturl, times):
    # breadth-first crawl: fetch and save up to `times` pages starting from firsturl
    urls = [firsturl]
    i = 0
    while True:
        if i > times:
            break
        if len(urls) > 0:
            url = urls.pop(0)
            print "%s\t%d<br>\n" % (url, len(urls))
            content = gethtml(url)
            if not content:
                continue
            savefile("%s/urls/%s" % (path, str(i)), content)
            i = i + 1
            if len(urls) < times:
                # queue newly found links, skipping ones already queued
                urllist = findurl(content)
                for url in urllist:
                    if urls.count(url) == 0:
                        urls.append(url)
        else:
            break
    return 1
crawler('http://www.google.com', 10)
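As mentioned in the introduction, the script sends no Referer header and does not go through a proxy. Below is a rough sketch of how either could be bolted onto the fetch with urllib2; the proxy address and referer value are hypothetical placeholders, not part of the original program, and the function reuses the urllib2 module already imported at the top of the script.
# Sketch only: fetching a URL with an optional Referer header and HTTP proxy.
def gethtml_with_headers(url, referer=None, proxy=None):
    handlers = []
    if proxy:
        # hypothetical example: proxy = 'http://127.0.0.1:8080'
        handlers.append(urllib2.ProxyHandler({'http': proxy}))
    opener = urllib2.build_opener(*handlers)
    request = urllib2.Request(url)
    if referer:
        # hypothetical example: referer = 'http://www.google.com/'
        request.add_header('Referer', referer)
    try:
        fp = opener.open(request)
    except Exception:
        return ''
    content = fp.read()
    fp.close()
    return content
A call such as gethtml_with_headers(url, referer='http://www.google.com/', proxy='http://127.0.0.1:8080') could then replace gethtml(url) inside crawler() if those features were wanted.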