From 8ffc971138a4f23dcabf45197255a059bf9e1b52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Kowalczyk?= Date: Thu, 18 Oct 2012 16:45:51 +0200 Subject: href_extractor: example code extracting href elements It does so in a streaming manner using the "Streaming HTML parser". --- docs/examples/href_extractor.c | 86 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 docs/examples/href_extractor.c (limited to 'docs/examples/href_extractor.c') diff --git a/docs/examples/href_extractor.c b/docs/examples/href_extractor.c new file mode 100644 index 000000000..4181c8e24 --- /dev/null +++ b/docs/examples/href_extractor.c @@ -0,0 +1,86 @@ +/*************************************************************************** + * _ _ ____ _ + * Project ___| | | | _ \| | + * / __| | | | |_) | | + * | (__| |_| | _ <| |___ + * \___|\___/|_| \_\_____| + * + * Copyright (C) 2012, Daniel Stenberg, , et al. + * + * This software is licensed as described in the file COPYING, which + * you should have received as part of this distribution. The terms + * are also available at http://curl.haxx.se/docs/copyright.html. + * + * You may opt to use, copy, modify, merge, publish, distribute and/or sell + * copies of the Software, and permit persons to whom the Software is + * furnished to do so, under the terms of the COPYING file. + * + * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY + * KIND, either express or implied. + * + ***************************************************************************/ + +/* + * This example uses the "Streaming HTML parser" to extract the href pieces in + * a streaming manner from a downloaded HTML. Kindly donated by MichaƂ + * Kowalczyk. + * + * The parser is found at + * http://code.google.com/p/streamhtmlparser/ + */ + +#include +#include +#include + + +static size_t write_callback(void *buffer, size_t size, size_t nmemb, + void *hsp) +{ + size_t realsize = size * nmemb, p; + for (p = 0; p < realsize; p++) { + html_parser_char_parse(hsp, ((char *)buffer)[p]); + if (html_parser_cmp_tag(hsp, "a", 1)) + if (html_parser_cmp_attr(hsp, "href", 4)) + if (html_parser_is_in(hsp, HTML_VALUE_ENDED)) { + html_parser_val(hsp)[html_parser_val_length(hsp)] = '\0'; + printf("%s\n", html_parser_val(hsp)); + } + } + return realsize; +} + +int main(int argc, char *argv[]) +{ + char tag[1], attr[4], val[128]; + CURL *curl; + HTMLSTREAMPARSER *hsp; + + if (argc != 2) { + printf("Usage: %s URL\n", argv[0]); + return EXIT_FAILURE; + } + + curl = curl_easy_init(); + + hsp = html_parser_init(); + + html_parser_set_tag_to_lower(hsp, 1); + html_parser_set_attr_to_lower(hsp, 1); + html_parser_set_tag_buffer(hsp, tag, sizeof(tag)); + html_parser_set_attr_buffer(hsp, attr, sizeof(attr)); + html_parser_set_val_buffer(hsp, val, sizeof(val)-1); + + curl_easy_setopt(curl, CURLOPT_URL, argv[1]); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_callback); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, hsp); + curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1); + + curl_easy_perform(curl); + + curl_easy_cleanup(curl); + + html_parser_cleanup(hsp); + + return EXIT_SUCCESS; +} -- cgit v1.2.3