Learning Python

http://stitchpanorama.sourceforge.net/Python/svd.py
I recently wrote a program that crawls web pages as a learning exercise. It is very simple and does not handle the Referer header or proxies. The code is as follows:
#!/usr/local/python/python
# -*- coding: iso-8859-1 -*-
# Cnangel 2007/11/24 -- a simple crawler

import os
import re
import urllib2

# CGI response header; \015\012 is CRLF, and the blank line ends the headers.
print "Content-type: text/html\015\012\015\012"

path = "F:/www/test.huhoo.net/public_html/python"

def gethtml(url):
    # Fetch a URL and return its body, or an empty string on failure.
    try:
        fp = urllib2.urlopen(url)
    except Exception:
        print "Can't open this url"
        return ''
    content = fp.read()  # read() returns the whole body at once
    fp.close()
    return content

def savefile(filename, content, method='wb'):
    op = open(filename, method)
    op.write(content)
    op.close()
    return 1

def readfile(filename, method='rb'):
    fp = open(filename, method)
    content = fp.read()  # keep the whole file, not just the last line read
    fp.close()
    return content

def findurl(content):
    # Crude link extraction: anything that looks like an absolute http URL.
    urlres = re.compile(r'http://[^"\'>\s]+')
    return urlres.findall(content)

def crawler(firsturl, times):
    # Breadth-first crawl from firsturl, saving at most `times` pages.
    urls = [firsturl]
    seen = set(urls)  # remember queued URLs so no page is fetched twice
    if not os.path.isdir("%s/urls" % path):
        os.makedirs("%s/urls" % path)  # make sure the output directory exists
    i = 0
    while urls and i < times:
        url = urls.pop(0)
        print "%s\t%d<br>\n" % (url, len(urls))
        content = gethtml(url)
        if not content:
            continue
        savefile("%s/urls/%s" % (path, str(i)), content)
        i = i + 1
        if len(urls) < times:
            for newurl in findurl(content):
                if newurl not in seen:
                    seen.add(newurl)
                    urls.append(newurl)
    return 1

crawler('http://www.google.com', 10)
This was written as a CGI program; to run it as an ordinary script, just remove the Content-Type header line.
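As noted above, the crawler ignores the Referer header and proxies. Both could be bolted on with the same urllib2 module; the following is only a minimal sketch under my own assumptions (the function name gethtml2 and the proxy address are made up for the example, not part of the original script):

import urllib2

def gethtml2(url, referer=None, proxy=None):
    # Build an opener, optionally routed through an HTTP proxy
    # (the address "http://127.0.0.1:8080" below is just an example).
    handlers = []
    if proxy:
        handlers.append(urllib2.ProxyHandler({'http': proxy}))
    opener = urllib2.build_opener(*handlers)
    request = urllib2.Request(url)
    if referer:
        request.add_header('Referer', referer)  # some sites check this header
    try:
        fp = opener.open(request)
    except Exception:
        return ''
    content = fp.read()
    fp.close()
    return content

# Hypothetical call:
# gethtml2('http://www.google.com', referer='http://www.google.com',
#          proxy='http://127.0.0.1:8080')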


