From 883343ba63a376ce9326149b0476262993d73b08 Mon Sep 17 00:00:00 2001 From: Daniel Stenberg Date: Mon, 31 Jan 2005 18:22:40 +0000 Subject: HTML parsing (with libxml) example code by Lars Nilsson. --- docs/examples/Makefile.am | 2 +- docs/examples/htmltitle.cc | 297 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 298 insertions(+), 1 deletion(-) create mode 100644 docs/examples/htmltitle.cc (limited to 'docs/examples') diff --git a/docs/examples/Makefile.am b/docs/examples/Makefile.am index 34ac520a7..f9b2939c4 100644 --- a/docs/examples/Makefile.am +++ b/docs/examples/Makefile.am @@ -10,7 +10,7 @@ EXTRA_DIST = README curlgtk.c sepheaders.c simple.c postit2.c \ post-callback.c multi-app.c multi-double.c multi-single.c \ multi-post.c fopen.c simplepost.c makefile.dj curlx.c https.c \ multi-debugcallback.c fileupload.c getinfo.c ftp3rdparty.c debug.c \ - anyauthput.c + anyauthput.c htmltitle.cc all: @echo "done" diff --git a/docs/examples/htmltitle.cc b/docs/examples/htmltitle.cc new file mode 100644 index 000000000..7d0cc4c3c --- /dev/null +++ b/docs/examples/htmltitle.cc @@ -0,0 +1,297 @@ +/***************************************************************************** + * _ _ ____ _ + * Project ___| | | | _ \| | + * / __| | | | |_) | | + * | (__| |_| | _ <| |___ + * \___|\___/|_| \_\_____| + * + * $Id$ + */ + +// Get a web page, parse it with libxml. +// +// Written by Lars Nilsson +// +// GNU C++ compile command line suggestion (edit paths accordingly): +// +// g++ -Wall -I/opt/curl/include -I/opt/libxml/include/libxml2 htmltitle.cc \ +// -o htmltitle -L/opt/curl/lib -L/opt/libxml/lib -lcurl -lxml2 + +#include +#include +#include +#include +#include +#include + +// +// Case-insensitive string comparison +// + +#ifdef _MSC_VER +#define COMPARE(a, b) (!stricmp((a), (b))) +#else +#define COMPARE(a, b) (!strcasecmp((a), (b))) +#endif + +// +// libxml callback context structure +// + +struct Context +{ + Context(): addTitle(false) { } + + bool addTitle; + std::string title; +}; + +// +// libcurl variables for error strings and returned data + +static char errorBuffer[CURL_ERROR_SIZE]; +static std::string buffer; + +// +// libcurl write callback function +// + +static int writer(char *data, size_t size, size_t nmemb, + std::string *writerData) +{ + if (writerData == NULL) + return 0; + + writerData->append(data, size*nmemb); + + return size * nmemb; +} + +// +// libcurl connection initialization +// + +static bool init(CURL *&conn, char *url) +{ + CURLcode code; + + conn = curl_easy_init(); + + if (conn == NULL) + { + fprintf(stderr, "Failed to create CURL connection\n"); + + exit(EXIT_FAILURE); + } + + code = curl_easy_setopt(conn, CURLOPT_ERRORBUFFER, errorBuffer); + if (code != CURLE_OK) + { + fprintf(stderr, "Failed to set error buffer [%d]\n", code); + + return false; + } + + code = curl_easy_setopt(conn, CURLOPT_URL, url); + if (code != CURLE_OK) + { + fprintf(stderr, "Failed to set URL [%s]\n", errorBuffer); + + return false; + } + + code = curl_easy_setopt(conn, CURLOPT_FOLLOWLOCATION, 1); + if (code != CURLE_OK) + { + fprintf(stderr, "Failed to set redirect option [%s]\n", errorBuffer); + + return false; + } + + code = curl_easy_setopt(conn, CURLOPT_WRITEFUNCTION, writer); + if (code != CURLE_OK) + { + fprintf(stderr, "Failed to set writer [%s]\n", errorBuffer); + + return false; + } + + code = curl_easy_setopt(conn, CURLOPT_WRITEDATA, &buffer); + if (code != CURLE_OK) + { + fprintf(stderr, "Failed to set write data [%s]\n", errorBuffer); + + return false; + } + + return true; +} + +// +// libxml start element callback function +// + +static void startElement(void *voidContext, + const xmlChar *name, + const xmlChar **attributes) +{ + Context *context = (Context *)voidContext; + + if (COMPARE((char *)name, "TITLE")) + { + context->title = ""; + context->addTitle = true; + } +} + +// +// libxml end element callback function +// + +static void endElement(void *voidContext, + const xmlChar *name) +{ + Context *context = (Context *)voidContext; + + if (COMPARE((char *)name, "TITLE")) + context->addTitle = false; +} + +// +// Text handling helper function +// + +static void handleCharacters(Context *context, + const xmlChar *chars, + int length) +{ + if (context->addTitle) + context->title.append((char *)chars, length); +} + +// +// libxml PCDATA callback function +// + +static void characters(void *voidContext, + const xmlChar *chars, + int length) +{ + Context *context = (Context *)voidContext; + + handleCharacters(context, chars, length); +} + +// +// libxml CDATA callback function +// + +static void cdata(void *voidContext, + const xmlChar *chars, + int length) +{ + Context *context = (Context *)voidContext; + + handleCharacters(context, chars, length); +} + +// +// libxml SAX callback structure +// + +static htmlSAXHandler saxHandler = +{ + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + startElement, + endElement, + NULL, + characters, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + cdata, + NULL +}; + +// +// Parse given (assumed to be) HTML text and return the title +// + +static void parseHtml(const std::string &html, + std::string &title) +{ + htmlParserCtxtPtr ctxt; + Context context; + + ctxt = htmlCreatePushParserCtxt(&saxHandler, &context, "", 0, "", + XML_CHAR_ENCODING_NONE); + + htmlParseChunk(ctxt, html.c_str(), html.size(), 0); + htmlParseChunk(ctxt, "", 0, 1); + + htmlFreeParserCtxt(ctxt); + + title = context.title; +} + +int main(int argc, char *argv[]) +{ + CURL *conn = NULL; + CURLcode code; + std::string title; + + // Ensure one argument is given + + if (argc != 2) + { + fprintf(stderr, "Usage: %s \n", argv[0]); + + exit(EXIT_FAILURE); + } + + // Initialize CURL connection + + if (!init(conn, argv[1])) + { + fprintf(stderr, "Connection initializion failed\n"); + + exit(EXIT_FAILURE); + } + + // Retrieve content for the URL + + code = curl_easy_perform(conn); + if (code != CURLE_OK) + { + fprintf(stderr, "Failed to get '%s' [%s]\n", argv[1], errorBuffer); + + exit(EXIT_FAILURE); + } + + // Parse the (assumed) HTML code + + parseHtml(buffer, title); + + // Display the extracted title + + printf("Title: %s\n", title.c_str()); + + return EXIT_SUCCESS; +} -- cgit v1.2.3