Learning Python

http://stitchpanorama.sourceforge.net/Python/svd.py
I recently wrote a program that crawls web pages as a learning exercise. It is very simple and does not handle the Referer header or proxies. The code is as follows:
#!/usr/local/python/python
# -*- coding: iso-8859-1 -*-
# Cnangel 2007/11/24 -- a simple crawler

import os
import re
import urllib2

# CGI response header; \015\012 is CRLF, and the blank line ends the headers.
print "Content-type: text/html\015\012\015\012"

path = "F:/www/test.huhoo.net/public_html/python"

def gethtml(url):
    # Fetch a URL and return its body, or an empty string on failure.
    try:
        fp = urllib2.urlopen(url)
    except Exception:
        print "Can't open this url"
        return ''
    content = fp.read()  # read() returns the whole body at once
    fp.close()
    return content

def savefile(filename, content, method='wb'):
    op = open(filename, method)
    op.write(content)
    op.close()
    return 1

def readfile(filename, method='rb'):
    fp = open(filename, method)
    content = fp.read()  # keep the whole file, not just the last line read
    fp.close()
    return content

def findurl(content):
    # Crude link extraction: anything that looks like an absolute http URL.
    urlres = re.compile(r'http://[^"\'>\s]+')
    return urlres.findall(content)

def crawler(firsturl, times):
    # Breadth-first crawl from firsturl, saving at most `times` pages.
    urls = [firsturl]
    seen = set(urls)  # remember queued URLs so no page is fetched twice
    if not os.path.isdir("%s/urls" % path):
        os.makedirs("%s/urls" % path)  # make sure the output directory exists
    i = 0
    while urls and i < times:
        url = urls.pop(0)
        print "%s\t%d<br>\n" % (url, len(urls))
        content = gethtml(url)
        if not content:
            continue
        savefile("%s/urls/%s" % (path, str(i)), content)
        i = i + 1
        if len(urls) < times:
            for newurl in findurl(content):
                if newurl not in seen:
                    seen.add(newurl)
                    urls.append(newurl)
    return 1

crawler('http://www.google.com', 10)
This was written as a CGI program; to run it as an ordinary script, just remove the Content-Type header line.
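As noted above, the crawler ignores the Referer header and proxies. Both could be bolted on with the same urllib2 module; the following is only a minimal sketch under my own assumptions (the function name gethtml2 and the proxy address are made up for the example, not part of the original script):

import urllib2

def gethtml2(url, referer=None, proxy=None):
    # Build an opener, optionally routed through an HTTP proxy
    # (the address "http://127.0.0.1:8080" below is just an example).
    handlers = []
    if proxy:
        handlers.append(urllib2.ProxyHandler({'http': proxy}))
    opener = urllib2.build_opener(*handlers)
    request = urllib2.Request(url)
    if referer:
        request.add_header('Referer', referer)  # some sites check this header
    try:
        fp = opener.open(request)
    except Exception:
        return ''
    content = fp.read()
    fp.close()
    return content

# Hypothetical call:
# gethtml2('http://www.google.com', referer='http://www.google.com',
#          proxy='http://127.0.0.1:8080')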


