diff options
author | Daniel Stenberg <daniel@haxx.se> | 2005-02-02 19:25:37 +0000 |
---|---|---|
committer | Daniel Stenberg <daniel@haxx.se> | 2005-02-02 19:25:37 +0000 |
commit | 6b81cf4bc9e0fa65bfef06f8dff8ad292b2fcb47 (patch) | |
tree | 28b32b5a398530f61d697045d3957a1b493648c2 | |
parent | 0d9301539e5ec58371c93f8e5f5346a5a4f5f43c (diff) |
HTML parsing example with libtidy, by Jeff Pohlmeyer
-rw-r--r-- | docs/examples/htmltidy.c | 118 |
1 files changed, 118 insertions, 0 deletions
diff --git a/docs/examples/htmltidy.c b/docs/examples/htmltidy.c new file mode 100644 index 000000000..112a4e6b6 --- /dev/null +++ b/docs/examples/htmltidy.c @@ -0,0 +1,118 @@ +/***************************************************************************** + * _ _ ____ _ + * Project ___| | | | _ \| | + * / __| | | | |_) | | + * | (__| |_| | _ <| |___ + * \___|\___/|_| \_\_____| + * + * $Id$ + * + * Download a document and use libtidy to parse the HTML. + * Written by Jeff Pohlmeyer + * + * LibTidy => http://tidy.sourceforge.net + * + * gcc -Wall -I/usr/local/include tidycurl.c -lcurl -ltidy -o tidycurl + * + */ + +#include <stdio.h> +#include <tidy/tidy.h> +#include <tidy/buffio.h> +#include <curl/curl.h> + +/* curl write callback, to fill tidy's input buffer... */ +uint write_cb(char *in, uint size, uint nmemb, TidyBuffer *out) +{ + uint r; + r = size * nmemb; + tidyBufAppend( out, in, r ); + return(r); +} + +/* Traverse the document tree */ +void dumpNode(TidyDoc doc, TidyNode tnod, int indent ) +{ + TidyNode child; + for ( child = tidyGetChild(tnod); child; child = tidyGetNext(child) ) + { + ctmbstr name = tidyNodeGetName( child ); + if ( name ) + { + /* if it has a name, then it's an HTML tag ... */ + TidyAttr attr; + printf( "%*.*s%s ", indent, indent, "<", name); + /* walk the attribute list */ + for ( attr=tidyAttrFirst(child); attr; attr=tidyAttrNext(attr) ) { + printf(tidyAttrName(attr)); + tidyAttrValue(attr)?printf("=\"%s\" ", + tidyAttrValue(attr)):printf(" "); + } + printf( ">\n"); + } + else { + /* if it doesn't have a name, then it's probably text, cdata, etc... */ + TidyBuffer buf; + tidyBufInit(&buf); + tidyNodeGetText(doc, child, &buf); + printf("%*.*s\n", indent, indent, buf.bp?(char *)buf.bp:""); + tidyBufFree(&buf); + } + dumpNode( doc, child, indent + 4 ); /* recursive */ + } +} + + +int main(int argc, char **argv ) +{ + CURL *curl; + char curl_errbuf[CURL_ERROR_SIZE]; + TidyDoc tdoc; + TidyBuffer docbuf = {0}; + TidyBuffer tidy_errbuf = {0}; + int err; + if ( argc == 2) { + curl = curl_easy_init(); + curl_easy_setopt(curl, CURLOPT_URL, argv[1]); + curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_errbuf); + curl_easy_setopt(curl, CURLOPT_NOPROGRESS, no); + curl_easy_setopt(curl, CURLOPT_VERBOSE, yes); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_cb); + + tdoc = tidyCreate(); + tidyOptSetBool(tdoc, TidyForceOutput, yes); /* try harder */ + tidyOptSetInt(tdoc, TidyWrapLen, 4096); + tidySetErrorBuffer( tdoc, &tidy_errbuf ); + tidyBufInit(&docbuf); + + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &docbuf); + err=curl_easy_perform(curl); + if ( !err ) { + err = tidyParseBuffer(tdoc, &docbuf); /* parse the input */ + if ( err >= 0 ) { + err = tidyCleanAndRepair(tdoc); /* fix any problems */ + if ( err >= 0 ) { + err = tidyRunDiagnostics(tdoc); /* load tidy error buffer */ + if ( err >= 0 ) { + dumpNode( tdoc, tidyGetRoot(tdoc), 0 ); /* walk the tree */ + fprintf(stderr, "%s\n", tidy_errbuf.bp); /* show errors */ + } + } + } + } + else + fprintf(stderr, "%s\n", curl_errbuf); + + /* clean-up */ + curl_easy_cleanup(curl); + tidyBufFree(&docbuf); + tidyBufFree(&tidy_errbuf); + tidyRelease(tdoc); + return(err); + + } + else + printf( "usage: %s <url>\n", argv[0] ); + + return(0); +} |