如何使用libxml2库?

    libxml2库是干什么的?很多人就开始说,是搞xml的、解析xml格式的、读取xml文件的......
其实说的都不错,但是对libxml2库的理解狭隘了一点。

     libxml2现在不仅可以解析XML(eXtensible Markup Language)格式,还可以解析HTML(HyperText Markup Language),以及作为这类标记语言最高层次标准的SGML(Standard Generalized Markup Language)。
下面举几个例子来说明:
1,libxml2读取文件或者链接:
#include <cstdio>
#include <cstring>
#include <cstdlib>
#include <iostream>
#include <string>
#include <libxml/parser.h>

// Size constants (3 * 1024 * 1024 and 1 * 1024 * 1024). Each expansion is
// parenthesized so the macro acts as a single value inside a larger
// expression: without the parentheses `x / BUFFERSIZE` would expand to
// `x / 1024 * 1024 * 3`. Neither macro is referenced by the code in this listing.
#define BUFFERSIZE (1024 * 1024 * 3)
#define READALINE (1024 * 1024 * 1)

using namespace std;

int main(int argc, char *argv[])
{
    if (argc != 2) {
        printf("Usgae: %s <Url or File>\n", argv[0]);
        exit(0);
    }
    char *file = argv[1];
    xmlParserCtxtPtr ctxt;
    xmlDocPtr doc;
    xmlNodePtr cur;
    // 创建一个Parser
    ctxt = xmlNewParserCtxt();
    // 读取链接流或文件流
    doc = xmlCtxtReadFile(ctxt, file, "GBK", XML_PARSE_DTDATTR|XML_PARSE_NOERROR);
    if (doc == NULL) {
        printf("Can't parse the content: %s\n", file);
        return 0;
    }
    // 获取根节点,此时的根节点可以通过cur->name获取
    cur = xmlDocGetRootElement(doc);
    if (cur == NULL) {
        printf("Can't get the root element: %s\n", file);
        xmlFreeDoc(doc);
        xmlFreeParserCtxt(ctxt);
        return 0;
    }
    // 获取子节点,此时的子节点也可以通过cur->name获取
    while (cur != NULL) {
        if (!xmlStrcmp(cur->name, (const xmlChar *)"news")){
            xmlChar *key;
            // 继续获取下个子节点的字节点
            xmlNodePtr l_cur = cur->xmlChildrenNode;
            while (l_cur != NULL) {
                if (!xmlStrcmp(l_cur->name, (const xmlChar *)"title")) {
                    key = xmlNodeListGetString(doc, l_cur->xmlChildrenNode, 1);
                    printf("title: %s\n", key);
                    xmlFree(key);
                } else if (!xmlStrcmp(l_cur->name, (const xmlChar *)"content")) {
                    key = xmlNodeListGetString(doc, l_cur->xmlChildrenNode, 1);
                    printf("content: %s\n", key);
                    xmlFree(key);
                }
                l_cur = l_cur->next;
            }
        }
        // 继续获取下个子节点
        cur = cur->next;
    }

    // 释放资源,很多网上的教程是没有释放资源或者释放得不彻底
    xmlFreeDoc(doc);
    xmlFreeParserCtxt(ctxt);
    xmlCleanupParser();
    return 0;
}
2,libxml2解析字符流(结合libcurl)
其实libxml2可以通过xmlCtxtReadMemory等函数进行解析,但是这个流必须是标准的xml格式,应用范围相对狭隘一些,这里结合libcurl来获取<a href=""></a>中的url信息:
#include <iostream>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <string>
#include <vector>
#include <curl/curl.h>
#include <libxml/HTMLparser.h>

using namespace std;

// COMPARE(a, b): true when the C strings a and b are equal ignoring case.
// The comparison function is selected per compiler: stricmp when _MSC_VER is
// defined, strcasecmp otherwise. Both arguments are parenthesized in the
// expansion and each is evaluated exactly once.
#ifdef _MSC_VER
#define COMPARE(a, b) (!stricmp((a), (b)))
#else
#define COMPARE(a, b) (!strcasecmp((a), (b)))
#endif

// One collected link: the URL recorded for an <a> element and the text
// gathered for it.
struct LinkStringDefined
{
    std::string url;          // copied from Context::url
    std::string anthor_text;  // copied from Context::title
};
typedef LinkStringDefined LinkString;

typedef struct ContextDefined
{
    ContextDefined(): addTitle(false) { }
    bool addTitle;
    string title;
    string url;
    vector<LinkString> terms;
} Context;

// Buffer handed to libcurl through CURLOPT_ERRORBUFFER; its contents are
// printed when a libcurl call fails.
static char errorBuffer[CURL_ERROR_SIZE];
// Downloaded data: writer() appends to it (it is registered as
// CURLOPT_WRITEDATA) and main() passes it to parseHtml().
static string buffer;
// Forward declarations of the helpers and SAX callbacks defined further down.
// writer: libcurl write callback; init: configures the easy handle;
// parseHtml: runs the HTML parser and fills the result vector.
static int writer(char *, size_t, size_t, string *);
static bool init(CURL *&, char *);
static void parseHtml(const string &, vector<LinkString> &);
// SAX callbacks referenced from the saxHandler table.
static void StartElement(void *, const xmlChar *, const xmlChar **);
static void EndElement(void *, const xmlChar *);
static void Characters(void *, const xmlChar *, int);
static void CdataBlock(void *, const xmlChar *, int);

int main(int argc, char* argv[])
{
    CURL *conn = NULL;
    CURLcode code;
    vector<LinkString> arr;
    if (argc != 2 && argc != 3)
    {
        fprintf(stderr, "Usage: %s <url> <null>\n", argv[0]);
        exit(EXIT_FAILURE);
    }
    curl_global_init(CURL_GLOBAL_DEFAULT);
    if (!init(conn, argv[1]))
    {
        fprintf(stderr, "Connection initializion failed\n");
        exit(EXIT_FAILURE);
    }
    code = curl_easy_perform(conn);
    curl_easy_cleanup(conn);
    if (code != CURLE_OK)
    {
        fprintf(stderr, "Failed to get '%s' [%s]\n", argv[1], errorBuffer);
        exit(EXIT_FAILURE);
    }
    parseHtml(buffer, arr);
    int arr_size = arr.size();
    for(int i = 0; i < arr_size; i ++)
    {
        cout << arr[i].anthor_text << "\t" << arr[i].url << endl;
    }
    return 0;
}

static bool init(CURL *&conn, char *url)
{
    CURLcode code;
    conn = curl_easy_init();
    if (conn == NULL)
    {
        fprintf(stderr, "Failed to create CURL connection\n");
        exit(EXIT_FAILURE);
    }
    code = curl_easy_setopt(conn, CURLOPT_ERRORBUFFER, errorBuffer);
    if (code != CURLE_OK)
    {
        fprintf(stderr, "Failed to set error buffer [%d]\n", code);
        return false;
    }
    code = curl_easy_setopt(conn, CURLOPT_URL, url);
    if (code != CURLE_OK)
    {
        fprintf(stderr, "Failed to set URL [%s]\n", errorBuffer);
        return false;
    }
    code = curl_easy_setopt(conn, CURLOPT_FOLLOWLOCATION, 1);
    if (code != CURLE_OK)
    {
        fprintf(stderr, "Failed to set redirect option [%s]\n", errorBuffer);
        return false;
    }
    code = curl_easy_setopt(conn, CURLOPT_WRITEFUNCTION, writer);
    if (code != CURLE_OK)
    {
        fprintf(stderr, "Failed to set writer [%s]\n", errorBuffer);
        return false;
    }
    code = curl_easy_setopt(conn, CURLOPT_WRITEDATA, &buffer);
    if (code != CURLE_OK)
    {
        fprintf(stderr, "Failed to set write data [%s]\n", errorBuffer);
        return false;
    }
    return true;
}

/*
 * libcurl write callback: append the size * nmemb bytes at `data` to
 * *writerData.
 *
 * Returns the number of bytes appended. Returns 0 when `writerData` or
 * `data` is NULL, or when the append fails (for example on allocation
 * failure); for a non-empty chunk that short count tells libcurl to abort
 * the transfer.
 *
 * Exceptions are caught here because this function is invoked from libcurl's
 * C code, which a C++ exception must not unwind through.
 *
 * NOTE(review): libcurl documents the write callback as returning size_t;
 * the int return type is kept because it must match the forward declaration
 * earlier in the file.
 */
static int writer(char *data, size_t size, size_t nmemb, string *writerData)
{
    if (writerData == NULL || data == NULL)
    {
        return 0;
    }

    const size_t total = size * nmemb;
    try
    {
        writerData->append(data, total);
    }
    catch (...)
    {
        return 0;
    }
    return static_cast<int>(total);
}

// Key point: this table lets you define yourself how nodes are handled
// (their start and end positions). Only four callbacks are installed:
// StartElement, EndElement, Characters and CdataBlock; every other slot is
// NULL. The initializer is positional, so the order of entries must not
// change. The slot names in the trailing comments follow the field order of
// libxml2's xmlSAXHandler structure -- verify against the installed
// <libxml/parser.h> before relying on them.
static htmlSAXHandler saxHandler =
{
    NULL,           // internalSubset
    NULL,           // isStandalone
    NULL,           // hasInternalSubset
    NULL,           // hasExternalSubset
    NULL,           // resolveEntity
    NULL,           // getEntity
    NULL,           // entityDecl
    NULL,           // notationDecl
    NULL,           // attributeDecl
    NULL,           // elementDecl
    NULL,           // unparsedEntityDecl
    NULL,           // setDocumentLocator
    NULL,           // startDocument
    NULL,           // endDocument
    StartElement,   // startElement
    EndElement,     // endElement
    NULL,           // reference
    Characters,     // characters
    NULL,           // ignorableWhitespace
    NULL,           // processingInstruction
    NULL,           // comment
    NULL,           // warning
    NULL,           // error
    NULL,           // fatalError
    NULL,           // getParameterEntity
    CdataBlock,     // cdataBlock
    NULL            // externalSubset
};

static void parseHtml(const string &html, vector<LinkString> &arr)
{
    htmlParserCtxtPtr ctxt;
    Context context;
    ctxt = htmlCreatePushParserCtxt(&saxHandler, &context, "", 0, "", XML_CHAR_ENCODING_NONE);
    htmlParseChunk(ctxt, html.c_str(), html.size(), 0);
    htmlParseChunk(ctxt, "", 0, 1);
    htmlFreeParserCtxt(ctxt);
    arr = context.terms;
}

static void StartElement(void *voidContext, const xmlChar *name, const xmlChar **attributes)
{
//  int sz_att = sizeof(**attributes) / sizeof(xmlChar);
    Context *context = (Context *)voidContext;
    if (COMPARE((char *)name, "a"))
    {
        context->url = (char *)attributes[1];
        context->title = "";
        context->addTitle = true;
    }
}

static void EndElement(void *voidContext, const xmlChar *name)
{
    Context *context = (Context *)voidContext;
    if (COMPARE((char *)name, "a"))
    {
        context->url = "";
        context->addTitle = false;
    }
}

/*
 * Shared body of the Characters and CdataBlock callbacks.
 *
 * While context->addTitle is set, the `length` bytes at `chars` are appended
 * to context->title, and a new entry holding the current URL and the title
 * accumulated so far is pushed onto context->terms. Nothing happens while
 * addTitle is clear.
 *
 * An entry is pushed on every call, so text that reaches this function in
 * several pieces yields several entries with progressively longer text.
 */
static void handleCharacters(Context *context, const xmlChar *chars, int length)
{
    if (!context->addTitle)
    {
        return;
    }

    context->title.append(reinterpret_cast<const char *>(chars), length);

    LinkString entry;
    entry.url = context->url;
    entry.anthor_text = context->title;
    context->terms.push_back(entry);
}

static void Characters(void *voidContext, const xmlChar *chars, int length)
{
    Context *context = (Context *)voidContext;
    handleCharacters(context, chars, length);
}

static void CdataBlock(void *voidContext, const xmlChar *chars, int length)
{
    Context *context = (Context *)voidContext;
    handleCharacters(context, chars, length);
}
关于libxml2就先介绍到这,其实这里面还有很多很多的小技巧,还需要各位去挖掘哟~! :)

Monthly Archives

Pages

Powered by Movable Type 7.7.2

About this Entry

This page contains a single entry by Cnangel published on January 18, 2009 4:42 PM.

关于vim里面二次编码问题 was the previous entry in this blog.

C语言去除空白字符 is the next entry in this blog.

Find recent content on the main index or look in the archives to find all content.