From 7877619f856a04af0519e92780b1d6674a8ff3f7 Mon Sep 17 00:00:00 2001
From: Daniel Stenberg <daniel@haxx.se>
Date: Sat, 15 Jun 2013 23:47:02 +0200
Subject: dotdot: introducing dot file path cleanup

RFC3986 details how a path part passed in as part of a URI should be
"cleaned" of dot sequences before getting used. The described
algorithm is now implemented in lib/dotdot.c with the accompanying test
case in test 1395.

Bug: http://curl.haxx.se/bug/view.cgi?id=1200
Reported-by: Alex Vinnik
---
 lib/Makefile.inc |   4 +-
 lib/dotdot.c     | 170 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 lib/dotdot.h     |  25 ++++++++
 lib/url.c        |  36 ++++++++----
 4 files changed, 223 insertions(+), 12 deletions(-)
 create mode 100644 lib/dotdot.c
 create mode 100644 lib/dotdot.h

(limited to 'lib')

diff --git a/lib/Makefile.inc b/lib/Makefile.inc
index f76e1ec83..4228bf6b8 100644
--- a/lib/Makefile.inc
+++ b/lib/Makefile.inc
@@ -25,7 +25,7 @@ CSOURCES = file.c timeval.c base64.c hostip.c progress.c formdata.c	\
   http_proxy.c non-ascii.c asyn-ares.c asyn-thread.c curl_gssapi.c	\
   curl_ntlm.c curl_ntlm_wb.c curl_ntlm_core.c curl_ntlm_msgs.c		\
   curl_sasl.c curl_schannel.c curl_multibyte.c curl_darwinssl.c		\
-  hostcheck.c bundles.c conncache.c pipeline.c
+  hostcheck.c bundles.c conncache.c pipeline.c dotdot.c
 
 HHEADERS = arpa_telnet.h netrc.h file.h timeval.h qssl.h hostip.h	\
   progress.h formdata.h cookie.h http.h sendf.h ftp.h url.h dict.h	\
@@ -44,4 +44,4 @@ HHEADERS = arpa_telnet.h netrc.h file.h timeval.h qssl.h hostip.h	\
   asyn.h curl_ntlm.h curl_gssapi.h curl_ntlm_wb.h curl_ntlm_core.h	\
   curl_ntlm_msgs.h curl_sasl.h curl_schannel.h curl_multibyte.h		\
   curl_darwinssl.h hostcheck.h bundles.h conncache.h curl_setup_once.h	\
-  multihandle.h setup-vms.h pipeline.h
+  multihandle.h setup-vms.h pipeline.h dotdot.h
diff --git a/lib/dotdot.c b/lib/dotdot.c
new file mode 100644
index 000000000..95b636780
--- /dev/null
+++ b/lib/dotdot.c
@@ -0,0 +1,170 @@
+/***************************************************************************
+ *                                  _   _ ____  _
+ *  Project                     ___| | | |  _ \| |
+ *                             / __| | | | |_) | |
+ *                            | (__| |_| |  _ <| |___
+ *                             \___|\___/|_| \_\_____|
+ *
+ * Copyright (C) 1998 - 2013, Daniel Stenberg, <daniel@haxx.se>, et al.
+ *
+ * This software is licensed as described in the file COPYING, which
+ * you should have received as part of this distribution. The terms
+ * are also available at http://curl.haxx.se/docs/copyright.html.
+ *
+ * You may opt to use, copy, modify, merge, publish, distribute and/or sell
+ * copies of the Software, and permit persons to whom the Software is
+ * furnished to do so, under the terms of the COPYING file.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ***************************************************************************/
+
+#include "curl_setup.h"
+
+#include "dotdot.h"
+
+#include "curl_memory.h"
+/* The last #include file should be: */
+#include "memdebug.h"
+
+/*
+ * "Remove Dot Segments"
+ * http://tools.ietf.org/html/rfc3986#section-5.2.4
+ */
+
+/*
+ * Curl_dedotdotify()
+ *
+ * This function gets a zero-terminated path with dot and dotdot sequences
+ * passed in and strips them off according to the rules in RFC 3986 section
+ * 5.2.4. The input string itself is not modified; work is done on a copy.
+ *
+ * The function handles a query part ('?' + stuff) appended but it expects
+ * that fragments ('#' + stuff) have already been cut off.
+ *
+ * RETURNS
+ *
+ * an allocated dedotdotified string (caller frees), or NULL on out of memory
+ */
+char *Curl_dedotdotify(char *input)
+{
+  size_t inlen = strlen(input);
+  char *clone;
+  size_t clen = inlen; /* remaining clone length; only tracked, never read */
+  char *out = malloc(inlen+1);
+  char *outp;
+  char *orgclone;
+  char *queryp;
+  if(!out)
+    return NULL; /* out of memory */
+
+  /* get a writable clone of the input; rules B and C patch it in place */
+  clone = strdup(input);
+  if(!clone) {
+    free(out);
+    return NULL;
+  }
+  orgclone = clone;
+  outp = out;
+
+  /*
+   * To handle query-parts properly, we must find it and remove it during the
+   * dotdot-operation and then append it again at the end to the output
+   * string.
+   */
+  queryp = strchr(clone, '?');
+  if(queryp)
+    *queryp = 0;
+
+  do {
+
+    /*  A.  If the input buffer begins with a prefix of "../" or "./", then
+        remove that prefix from the input buffer; otherwise, */
+
+    if(!strncmp("./", clone, 2)) {
+      clone+=2;
+      clen-=2;
+    }
+    else if(!strncmp("../", clone, 3)) {
+      clone+=3;
+      clen-=3;
+    }
+
+    /*  B.  if the input buffer begins with a prefix of "/./" or "/.", where
+        "."  is a complete path segment, then replace that prefix with "/" in
+        the input buffer; otherwise, */
+    else if(!strncmp("/./", clone, 3)) {
+      clone+=2;
+      clen-=2;
+    }
+    else if(!strcmp("/.", clone)) {
+      clone[1]='/'; /* overwrite the dot so a lone "/" remains */
+      clone++;
+      clen-=1;
+    }
+
+    /*  C.  if the input buffer begins with a prefix of "/../" or "/..", where
+        ".." is a complete path segment, then replace that prefix with "/" in
+        the input buffer and remove the last segment and its preceding "/" (if
+        any) from the output buffer; otherwise, */
+
+    else if(!strncmp("/../", clone, 4)) {
+      clone+=3;
+      clen-=3;
+      /* drop the last output segment along with its preceding slash */
+      while(outp > out) {
+        outp--;
+        if(*outp == '/')
+          break;
+      }
+      *outp = 0; /* zero-terminate where it stops */
+    }
+    else if(!strcmp("/..", clone)) {
+      clone[2]='/'; /* overwrite the last dot so a lone "/" remains */
+      clone+=2;
+      clen-=2;
+      /* drop the last output segment along with its preceding slash */
+      while(outp > out) {
+        outp--;
+        if(*outp == '/')
+          break;
+      }
+      *outp = 0; /* zero-terminate where it stops */
+    }
+
+    /*  D.  if the input buffer consists only of "." or "..", then remove
+        that from the input buffer; otherwise, */
+
+    else if(!strcmp(".", clone) || !strcmp("..", clone)) {
+      *clone=0; /* nothing left to process, this ends the outer loop */
+    }
+
+    else {
+      /*  E.  move the first path segment in the input buffer to the end of
+          the output buffer, including the initial "/" character (if any) and
+          any subsequent characters up to, but not including, the next "/"
+          character or the end of the input buffer. */
+
+      do {
+        *outp++ = *clone++;
+        clen--;
+      } while(*clone && (*clone != '/'));
+      *outp=0;
+    }
+
+  } while(*clone);
+
+  if(queryp) {
+    size_t qlen;
+    /* There was a query part, append that to the output. The 'clone' string
+       may now have been altered so we copy from the original input string
+       from the correct index. */
+    size_t oindex = queryp - orgclone;
+    qlen = strlen(&input[oindex]);
+    memcpy(outp, &input[oindex], qlen+1); /* include the ending zero byte */
+  }
+
+  free(orgclone);
+  return out;
+}
diff --git a/lib/dotdot.h b/lib/dotdot.h
new file mode 100644
index 000000000..c3487c137
--- /dev/null
+++ b/lib/dotdot.h
@@ -0,0 +1,25 @@
+#ifndef HEADER_CURL_DOTDOT_H
+#define HEADER_CURL_DOTDOT_H
+/***************************************************************************
+ *                                  _   _ ____  _
+ *  Project                     ___| | | |  _ \| |
+ *                             / __| | | | |_) | |
+ *                            | (__| |_| |  _ <| |___
+ *                             \___|\___/|_| \_\_____|
+ *
+ * Copyright (C) 1998 - 2013, Daniel Stenberg, <daniel@haxx.se>, et al.
+ *
+ * This software is licensed as described in the file COPYING, which
+ * you should have received as part of this distribution. The terms
+ * are also available at http://curl.haxx.se/docs/copyright.html.
+ *
+ * You may opt to use, copy, modify, merge, publish, distribute and/or sell
+ * copies of the Software, and permit persons to whom the Software is
+ * furnished to do so, under the terms of the COPYING file.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ***************************************************************************/
+char *Curl_dedotdotify(char *input);
+#endif
diff --git a/lib/url.c b/lib/url.c
index 7ba496986..e1c9dffe2 100644
--- a/lib/url.c
+++ b/lib/url.c
@@ -124,6 +124,7 @@ int curl_win32_idn_to_ascii(const char *in, char **out);
 #include "conncache.h"
 #include "multihandle.h"
 #include "pipeline.h"
+#include "dotdot.h"
 
 #define _MPRINTF_REPLACE /* use our functions only */
 #include <curl/mprintf.h>
@@ -3674,7 +3675,7 @@ static CURLcode parseurlandfillconn(struct SessionHandle *data,
   char protobuf[16];
   const char *protop;
   CURLcode result;
-  bool fix_slash = FALSE;
+  bool rebuild_url = FALSE;
 
   *prot_missing = FALSE;
 
@@ -3825,14 +3826,14 @@ static CURLcode parseurlandfillconn(struct SessionHandle *data,
     memcpy(path+1, query, hostlen);
 
     path[0]='/'; /* prepend the missing slash */
-    fix_slash = TRUE;
+    rebuild_url = TRUE;
 
     *query=0; /* now cut off the hostname at the ? */
   }
   else if(!path[0]) {
     /* if there's no path set, use a single slash */
     strcpy(path, "/");
-    fix_slash = TRUE;
+    rebuild_url = TRUE;
   }
 
   /* If the URL is malformatted (missing a '/' after hostname before path) we
@@ -3845,17 +3846,30 @@ static CURLcode parseurlandfillconn(struct SessionHandle *data,
        is bigger than the path. Use +1 to move the zero byte too. */
     memmove(&path[1], path, strlen(path)+1);
     path[0] = '/';
-    fix_slash = TRUE;
+    rebuild_url = TRUE;
+  }
+  else {
+    /* sanitise paths and remove ../ and ./ sequences according to RFC3986 */
+    char *newp = Curl_dedotdotify(path);
+
+    if(strcmp(newp, path)) {
+      rebuild_url = TRUE;
+      free(data->state.pathbuffer);
+      data->state.pathbuffer = newp;
+      data->state.path = newp;
+      path = newp;
+    }
+    else
+      free(newp);
   }
-
 
   /*
-   * "fix_slash" means that the URL was malformatted so we need to generate an
-   * updated version with the new slash inserted at the right place!  We need
-   * the corrected URL when communicating over HTTP proxy and we don't know at
-   * this point if we're using a proxy or not.
+   * "rebuild_url" means that one or more URL components have been modified so
+   * we need to generate an updated full version.  We need the corrected URL
+   * when communicating over HTTP proxy and we don't know at this point if
+   * we're using a proxy or not.
    */
-  if(fix_slash) {
+  if(rebuild_url) {
     char *reurl;
 
     size_t plen = strlen(path); /* new path, should be 1 byte longer than
@@ -3878,6 +3892,8 @@ static CURLcode parseurlandfillconn(struct SessionHandle *data,
       data->change.url_alloc = FALSE;
     }
 
+    infof(data, "Rebuilt URL to: %s\n", reurl);
+
     data->change.url = reurl;
     data->change.url_alloc = TRUE; /* free this later */
   }
-- 
cgit v1.2.3