diff options
author | Michał Kowalczyk <netuskowal@gmail.com> | 2012-10-18 16:45:51 +0200 |
---|---|---|
committer | Daniel Stenberg <daniel@haxx.se> | 2012-10-18 16:45:51 +0200 |
commit | 8ffc971138a4f23dcabf45197255a059bf9e1b52 (patch) | |
tree | 13ff3916a73f288f441bdc706acb88a6c5375124 | |
parent | f1d2e1850819f54d1c950989614da7445bdd457f (diff) |
href_extractor: example code extracting href elements
It does so in a streaming manner using the "Streaming HTML parser".
-rw-r--r-- | docs/examples/Makefile.inc | 2 | ||||
-rw-r--r-- | docs/examples/href_extractor.c | 86 |
2 files changed, 87 insertions, 1 deletions
diff --git a/docs/examples/Makefile.inc b/docs/examples/Makefile.inc index b548ebf7b..2b31f86fb 100644 --- a/docs/examples/Makefile.inc +++ b/docs/examples/Makefile.inc @@ -12,4 +12,4 @@ check_PROGRAMS = 10-at-a-time anyauthput cookie_interface debug fileupload \ COMPLICATED_EXAMPLES = curlgtk.c curlx.c htmltitle.cc cacertinmem.c \ ftpuploadresume.c ghiper.c hiperfifo.c htmltidy.c multithread.c \ opensslthreadlock.c sampleconv.c synctime.c threaded-ssl.c evhiperfifo.c \ - smooth-gtk-thread.c version-check.pl + smooth-gtk-thread.c version-check.pl href_extractor.c diff --git a/docs/examples/href_extractor.c b/docs/examples/href_extractor.c new file mode 100644 index 000000000..4181c8e24 --- /dev/null +++ b/docs/examples/href_extractor.c @@ -0,0 +1,86 @@ +/*************************************************************************** + * _ _ ____ _ + * Project ___| | | | _ \| | + * / __| | | | |_) | | + * | (__| |_| | _ <| |___ + * \___|\___/|_| \_\_____| + * + * Copyright (C) 2012, Daniel Stenberg, <daniel@haxx.se>, et al. + * + * This software is licensed as described in the file COPYING, which + * you should have received as part of this distribution. The terms + * are also available at http://curl.haxx.se/docs/copyright.html. + * + * You may opt to use, copy, modify, merge, publish, distribute and/or sell + * copies of the Software, and permit persons to whom the Software is + * furnished to do so, under the terms of the COPYING file. + * + * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY + * KIND, either express or implied. + * + ***************************************************************************/ + +/* + * This example uses the "Streaming HTML parser" to extract the href pieces in + * a streaming manner from a downloaded HTML. Kindly donated by Michał + * Kowalczyk. + * + * The parser is found at + * http://code.google.com/p/streamhtmlparser/ + */ + +#include <stdio.h> +#include <curl/curl.h> +#include <htmlstreamparser.h> + + +static size_t write_callback(void *buffer, size_t size, size_t nmemb, + void *hsp) +{ + size_t realsize = size * nmemb, p; + for (p = 0; p < realsize; p++) { + html_parser_char_parse(hsp, ((char *)buffer)[p]); + if (html_parser_cmp_tag(hsp, "a", 1)) + if (html_parser_cmp_attr(hsp, "href", 4)) + if (html_parser_is_in(hsp, HTML_VALUE_ENDED)) { + html_parser_val(hsp)[html_parser_val_length(hsp)] = '\0'; + printf("%s\n", html_parser_val(hsp)); + } + } + return realsize; +} + +int main(int argc, char *argv[]) +{ + char tag[1], attr[4], val[128]; + CURL *curl; + HTMLSTREAMPARSER *hsp; + + if (argc != 2) { + printf("Usage: %s URL\n", argv[0]); + return EXIT_FAILURE; + } + + curl = curl_easy_init(); + + hsp = html_parser_init(); + + html_parser_set_tag_to_lower(hsp, 1); + html_parser_set_attr_to_lower(hsp, 1); + html_parser_set_tag_buffer(hsp, tag, sizeof(tag)); + html_parser_set_attr_buffer(hsp, attr, sizeof(attr)); + html_parser_set_val_buffer(hsp, val, sizeof(val)-1); + + curl_easy_setopt(curl, CURLOPT_URL, argv[1]); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_callback); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, hsp); + curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1); + + curl_easy_perform(curl); + + curl_easy_cleanup(curl); + + html_parser_cleanup(hsp); + + return EXIT_SUCCESS; +} |