Python学习

http://stitchpanorama.sourceforge.net/Python/svd.py
最近学习写了一个抓取网页的程序,非常简单,没有考虑到referer以及使用代理,代码如下:
#!/usr/local/python/python

# -*- coding: iso-8859-1 -*-

# Cnangel 2007/11/24 it is crawler

import sys
import os
import re
import urllib2

# CGI response header (remove this line to run the file as a plain script).
print "Content-type:text/html\015\012\015\012"

# Base directory; crawler() saves fetched pages under "<path>/urls/".
path = "F:/www/test.huhoo.net/public_html/python"

def gethtml(url):
    """Fetch *url* and return the response body.

    Returns the body as a string ('' when the response is empty).  If the
    URL cannot be opened, prints "Can't open this url" and returns an empty
    list; both failure values are falsy, so callers can simply test the
    result for truthiness.  Errors raised while reading the body propagate.
    """
    try:
        fp = urllib2.urlopen(url)
    except Exception:
        # Best-effort fetch: any failure to open is reported, not raised.
        # Catching Exception (rather than a bare except) still lets
        # KeyboardInterrupt and SystemExit through.
        print("Can't open this url")
        return []
    try:
        # A single read() returns the whole body, and '' for an empty one,
        # so there is always a value to return.
        return fp.read()
    finally:
        # Release the connection even if read() fails.
        fp.close()

def savefile(filename, content, method = 'wb'):
    """Write *content* to *filename* and return 1.

    *method* is the mode handed to open(); the default 'wb' truncates the
    file and writes in binary mode.  Errors from open() or write()
    propagate to the caller; the file is closed in either case.
    """
    op = open(filename, method)
    try:
        op.write(content)
    finally:
        # Close the handle even when write() raises.
        op.close()
    return 1

def readfile(filename, method = 'rb'):
    """Return the entire contents of *filename*.

    *method* is the mode handed to open(); the default 'rb' reads in binary
    mode.  An empty file yields an empty string.  Errors from open() or
    read() propagate to the caller; the file is closed in either case.
    """
    fp = open(filename, method)
    try:
        # Read everything in one call; keeping only the most recent
        # readline() result would drop all but the last line.
        return fp.read()
    finally:
        fp.close()

def findurl(content):
    """Return all http:// URLs found in *content*, in order of appearance.

    A URL runs from "http://" up to the first whitespace character, quote
    (single or double) or ">".  Duplicates are kept, and other schemes
    such as https:// are not matched.
    """
    # Raw string so the backslashes reach the regex engine untouched.
    return re.findall(r'http://[^"\'>\s]+', content)
   
def crawler(firsturl, times):
    """Crawl breadth-first from *firsturl*, saving at most *times* pages.

    Each successfully fetched page is written to "<path>/urls/<n>", where
    n counts saved pages from 0.  Every visited URL is printed together
    with the number of URLs still queued.  URLs that fail to load (or
    return an empty body) are skipped and do not count towards *times*.
    Links are only harvested from a page while fewer than *times* URLs are
    pending.  Always returns 1.
    """
    queue = [firsturl]
    # Every URL ever queued, so a page that links back to an already
    # visited address does not get fetched again.
    seen = set(queue)
    saved = 0
    # Stop after exactly `times` saved pages, or when nothing is left.
    while queue and saved < times:
        url = queue.pop(0)
        print("%s\t%d<br>\n" % (url, len(queue)))
        content = gethtml(url)
        if not content:
            continue
        savefile("%s/urls/%s" % (path, saved), content)
        saved += 1
        if len(queue) < times:
            for link in findurl(content):
                if link not in seen:
                    seen.add(link)
                    queue.append(link)
    return 1

# Script entry point: start crawling at Google, passing 10 as the `times` limit.
crawler('http://www.google.com', 10)
这是一个CGI程序;如果想把它当作普通脚本运行,只需去掉输出Content-type头的那一行即可。



Categories

| | 评论(0)

发表评论

May 2010

            1
2 3 4 5 6 7 8
9 10 11 12 13 14 15
16 17 18 19 20 21 22
23 24 25 26 27 28 29
30 31          

关于此日记

此日记由 Cnangel 发表于 September 19, 2007 6:39 PM

此Blog上的上一篇日记:《蜘蛛抓取影响了我》

此Blog上的下一篇日记:《一段有趣的js》

在首页和归档页可以看到最新的日记和所有日记。

归档

Powered by Movable Type 5.02