libxml2库是干什么的?很多人会说:是搞xml的、解析xml格式的、读取xml文件的......
其实这些说法都没错,只是对libxml2库的理解狭隘了一点。
libxml2现在不仅可以解析XML(eXtensible Markup Language)格式,还可以解析HTML(HyperText Markup Language),以及作为现时常用超文本格式最高层次标准的SGML(Standard Generalized Markup Language)。
下面举几个例子来说明:
1,libxml2读取文件或者链接:
2,libxml2解析字符流(结合libcurl)
#include <cstdio>
#include <cstring>
#include <cstdlib>
#include <iostream>
#include <string>
#include <libxml/parser.h>
// Buffer-size constants (3 MiB and 1 MiB). The expansions are parenthesized so
// they behave as single values inside larger expressions such as `x / BUFFERSIZE`.
// Neither macro is referenced by the code shown in this listing.
#define BUFFERSIZE (1024 * 1024 * 3)
#define READALINE (1024 * 1024 * 1)
using namespace std;
int main(int argc, char *argv[])
{
if (argc != 2) {
printf("Usgae: %s <Url or File>\n", argv[0]);
exit(0);
}
char *file = argv[1];
xmlParserCtxtPtr ctxt;
xmlDocPtr doc;
xmlNodePtr cur;
// 创建一个Parser
ctxt = xmlNewParserCtxt();
// 读取链接流或文件流
doc = xmlCtxtReadFile(ctxt, file, "GBK", XML_PARSE_DTDATTR|XML_PARSE_NOERROR);
if (doc == NULL) {
printf("Can't parse the content: %s\n", file);
return 0;
}
// 获取根节点,此时的根节点可以通过cur->name获取
cur = xmlDocGetRootElement(doc);
if (cur == NULL) {
printf("Can't get the root element: %s\n", file);
xmlFreeDoc(doc);
xmlFreeParserCtxt(ctxt);
return 0;
}
// 获取子节点,此时的子节点也可以通过cur->name获取
while (cur != NULL) {
if (!xmlStrcmp(cur->name, (const xmlChar *)"news")){
xmlChar *key;
// 继续获取下个子节点的字节点
xmlNodePtr l_cur = cur->xmlChildrenNode;
while (l_cur != NULL) {
if (!xmlStrcmp(l_cur->name, (const xmlChar *)"title")) {
key = xmlNodeListGetString(doc, l_cur->xmlChildrenNode, 1);
printf("title: %s\n", key);
xmlFree(key);
} else if (!xmlStrcmp(l_cur->name, (const xmlChar *)"content")) {
key = xmlNodeListGetString(doc, l_cur->xmlChildrenNode, 1);
printf("content: %s\n", key);
xmlFree(key);
}
l_cur = l_cur->next;
}
}
// 继续获取下个子节点
cur = cur->next;
}
// 释放资源,很多网上的教程是没有释放资源或者释放得不彻底
xmlFreeDoc(doc);
xmlFreeParserCtxt(ctxt);
xmlCleanupParser();
return 0;
}
其实libxml2可以通过xmlCtxtReadMemory等函数进行解析,但是这个流必须是标准的xml格式,应用范围相对狭隘一些,这里结合libcurl来获取<a href=""></a>中的url信息:
关于libxml2就先介绍到这,其实这里面还有很多很多的小技巧,还需要各位去挖掘哟~! :)
#include <iostream>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <string>
#include <vector>
#include <curl/curl.h>
#include <libxml/HTMLparser.h>
using namespace std;
// COMPARE(a, b): nonzero when the C strings a and b are equal ignoring letter case.
#ifdef _MSC_VER
// _stricmp is the conforming MSVC name; plain stricmp is deprecated there.
#define COMPARE(a, b) (!_stricmp((a), (b)))
#else
// strcasecmp is declared by POSIX <strings.h>; <string.h> alone does not guarantee it.
#include <strings.h>
#define COMPARE(a, b) (!strcasecmp((a), (b)))
#endif
// One extracted hyperlink: the URL taken from an <a> tag together with the
// text collected inside that tag.
typedef struct LinkStringDefined
{
    std::string url;          // link target
    std::string anthor_text;  // text found inside the <a> element (field name kept as-is)
} LinkString;
// Parsing state handed to the SAX callbacks as their user-data pointer.
typedef struct ContextDefined
{
    bool addTitle;                  // true while the parser is inside an <a> element
    std::string title;              // text accumulated for the current <a> element
    std::string url;                // URL recorded for the current <a> element
    std::vector<LinkString> terms;  // links collected so far

    // Starts outside of any <a> element; the strings and the vector start empty.
    ContextDefined()
        : addTitle(false)
    {
    }
} Context;
static char errorBuffer[CURL_ERROR_SIZE];
static string buffer;
static int writer(char *, size_t, size_t, string *);
static bool init(CURL *&, char *);
static void parseHtml(const string &, vector<LinkString> &);
static void StartElement(void *, const xmlChar *, const xmlChar **);
static void EndElement(void *, const xmlChar *);
static void Characters(void *, const xmlChar *, int);
static void CdataBlock(void *, const xmlChar *, int);
int main(int argc, char* argv[])
{
CURL *conn = NULL;
CURLcode code;
vector<LinkString> arr;
if (argc != 2 && argc != 3)
{
fprintf(stderr, "Usage: %s <url> <null>\n", argv[0]);
exit(EXIT_FAILURE);
}
curl_global_init(CURL_GLOBAL_DEFAULT);
if (!init(conn, argv[1]))
{
fprintf(stderr, "Connection initializion failed\n");
exit(EXIT_FAILURE);
}
code = curl_easy_perform(conn);
curl_easy_cleanup(conn);
if (code != CURLE_OK)
{
fprintf(stderr, "Failed to get '%s' [%s]\n", argv[1], errorBuffer);
exit(EXIT_FAILURE);
}
parseHtml(buffer, arr);
int arr_size = arr.size();
for(int i = 0; i < arr_size; i ++)
{
cout << arr[i].anthor_text << "\t" << arr[i].url << endl;
}
return 0;
}
// Releases a partially configured handle and reports failure to init()'s caller.
static bool discardHandle(CURL *&conn)
{
    curl_easy_cleanup(conn);
    conn = NULL;
    return false;
}

/*
 * Creates a libcurl easy handle in `conn` and configures it to fetch `url`:
 * errors are written to errorBuffer, redirects are followed, and the body is
 * delivered to writer(), which appends it to the file-scope `buffer`.
 *
 * Returns true on success. On any failure a message is printed to stderr,
 * the handle (if one was created) is released, `conn` is set to NULL and
 * false is returned.
 */
static bool init(CURL *&conn, char *url)
{
    conn = curl_easy_init();
    if (conn == NULL)
    {
        fprintf(stderr, "Failed to create CURL connection\n");
        return false;
    }

    CURLcode code = curl_easy_setopt(conn, CURLOPT_ERRORBUFFER, errorBuffer);
    if (code != CURLE_OK)
    {
        // errorBuffer is not registered yet, so only the numeric code is available.
        fprintf(stderr, "Failed to set error buffer [%d]\n", code);
        return discardHandle(conn);
    }

    code = curl_easy_setopt(conn, CURLOPT_URL, url);
    if (code != CURLE_OK)
    {
        fprintf(stderr, "Failed to set URL [%s]\n", errorBuffer);
        return discardHandle(conn);
    }

    // curl_easy_setopt is variadic and this option is read as a long,
    // so the value must be passed as 1L rather than as an int.
    code = curl_easy_setopt(conn, CURLOPT_FOLLOWLOCATION, 1L);
    if (code != CURLE_OK)
    {
        fprintf(stderr, "Failed to set redirect option [%s]\n", errorBuffer);
        return discardHandle(conn);
    }

    code = curl_easy_setopt(conn, CURLOPT_WRITEFUNCTION, writer);
    if (code != CURLE_OK)
    {
        fprintf(stderr, "Failed to set writer [%s]\n", errorBuffer);
        return discardHandle(conn);
    }

    code = curl_easy_setopt(conn, CURLOPT_WRITEDATA, &buffer);
    if (code != CURLE_OK)
    {
        fprintf(stderr, "Failed to set write data [%s]\n", errorBuffer);
        return discardHandle(conn);
    }

    return true;
}
/*
 * Write callback registered with libcurl in init(): appends the
 * size * nmemb bytes starting at `data` to *writerData.
 *
 * Returns the number of bytes appended, or 0 when writerData is NULL.
 * NOTE(review): libcurl documents this callback as returning size_t; the int
 * return type is dictated by the forward declaration — confirm before changing.
 */
static int writer(char *data, size_t size, size_t nmemb, string *writerData)
{
    const unsigned long long byteCount = size * nmemb;

    if (!writerData)
    {
        return 0;
    }

    writerData->append(data, byteCount);
    return byteCount;
}
// Key point: you can define your own handling of nodes (their start and end positions).
//
// SAX callback table passed to htmlCreatePushParserCtxt() in parseHtml(). The
// initializer is positional, so each entry must stay in its slot; only the
// element start/end, character data and CDATA callbacks are installed.
// NOTE(review): the slot names below follow the field order of libxml2's
// xmlSAXHandler structure — verify against the installed <libxml/parser.h>.
// Members after the last listed entry receive no explicit initializer.
static htmlSAXHandler saxHandler =
{
NULL, // internalSubset
NULL, // isStandalone
NULL, // hasInternalSubset
NULL, // hasExternalSubset
NULL, // resolveEntity
NULL, // getEntity
NULL, // entityDecl
NULL, // notationDecl
NULL, // attributeDecl
NULL, // elementDecl
NULL, // unparsedEntityDecl
NULL, // setDocumentLocator
NULL, // startDocument
NULL, // endDocument
StartElement, // startElement
EndElement, // endElement
NULL, // reference
Characters, // characters
NULL, // ignorableWhitespace
NULL, // processingInstruction
NULL, // comment
NULL, // warning
NULL, // error
NULL, // fatalError
NULL, // getParameterEntity
CdataBlock, // cdataBlock
NULL // externalSubset
};
static void parseHtml(const string &html, vector<LinkString> &arr)
{
htmlParserCtxtPtr ctxt;
Context context;
ctxt = htmlCreatePushParserCtxt(&saxHandler, &context, "", 0, "", XML_CHAR_ENCODING_NONE);
htmlParseChunk(ctxt, html.c_str(), html.size(), 0);
htmlParseChunk(ctxt, "", 0, 1);
htmlFreeParserCtxt(ctxt);
arr = context.terms;
}
static void StartElement(void *voidContext, const xmlChar *name, const xmlChar **attributes)
{
// int sz_att = sizeof(**attributes) / sizeof(xmlChar);
Context *context = (Context *)voidContext;
if (COMPARE((char *)name, "a"))
{
context->url = (char *)attributes[1];
context->title = "";
context->addTitle = true;
}
}
static void EndElement(void *voidContext, const xmlChar *name)
{
Context *context = (Context *)voidContext;
if (COMPARE((char *)name, "a"))
{
context->url = "";
context->addTitle = false;
}
}
/*
 * Shared body of the Characters and CdataBlock callbacks. While text
 * collection is on (inside an <a> element), appends `length` bytes from
 * `chars` to the current title and keeps exactly one entry per anchor in
 * context->terms.
 *
 * The parser may deliver the text of a single element in several chunks. The
 * first chunk creates the entry; later chunks update that entry's text
 * instead of pushing another, partially filled, duplicate.
 */
static void handleCharacters(Context *context, const xmlChar *chars, int length)
{
    if (!context->addTitle || chars == NULL || length <= 0)
    {
        return;
    }

    // StartElement resets the title, so an empty title marks the first chunk
    // of the current anchor.
    const bool firstChunk = context->title.empty();
    context->title.append((const char *)chars, length);

    if (firstChunk)
    {
        LinkString linkString;
        linkString.anthor_text = context->title;
        linkString.url = context->url;
        context->terms.push_back(linkString);
    }
    else if (!context->terms.empty())
    {
        // The last entry belongs to the anchor currently being read.
        context->terms.back().anthor_text = context->title;
    }
}
static void Characters(void *voidContext, const xmlChar *chars, int length)
{
Context *context = (Context *)voidContext;
handleCharacters(context, chars, length);
}
static void CdataBlock(void *voidContext, const xmlChar *chars, int length)
{
Context *context = (Context *)voidContext;
handleCharacters(context, chars, length);
}