1 files changed, 321 insertions, 0 deletions
diff --git a/src/palrt/urlpars.cpp b/src/palrt/urlpars.cpp
new file mode 100644
index 0000000000..77b0a6e2ba
--- /dev/null
+++ b/src/palrt/urlpars.cpp
@@ -0,0 +1,321 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+//
+
+//
+// ===========================================================================
+// File: urlpars.cpp
+//
+// URL APIs ported from shlwapi (especially for Fusion)
+// ===========================================================================
+
+#include "common.h"
+
+#define SLASH       W('/')     // forward slash
+#define WHACK       W('\\')    // backslash
+// UPF_* scheme property bits; combined in the dwFlags column of g_mpUrlSchemeTypes.
+#define UPF_SCHEME_OPAQUE           0x00000001  //  should not be treated as hierarchical
+#define UPF_SCHEME_INTERNET         0x00000002
+#define UPF_SCHEME_NOHISTORY        0x00000004
+#define UPF_SCHEME_CONVERT          0x00000008  //  treat slashes and whacks as equiv
+#define UPF_SCHEME_DONTCORRECT      0x00000010  //  Don't try to autocorrect to this scheme
+
+PRIVATE CONST WORD isSafe[96] =
+// Character-class bits for code points 0x20..0x7F; IsSafe indexes this table with (ch - 32).
+/*   Bit 0 (mask 1)  alphadigit     -- 'a' to 'z', '0' to '9', 'A' to 'Z'
+**   Bit 1 (mask 2)  Hex            -- '0' to '9', 'a' to 'f', 'A' to 'F'
+**   Bit 2 (mask 4)  valid scheme   -- "-" | "." | "+" (alphadigits are covered by bit 0; IsValidSchemeCharW tests mask 5)
+**   Bit 3 (mask 8)  mark           -- "%" | "$"| "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")" | ","
+**   NOTE(review): the table also sets bit 3 for "+", ":", ";", "=" and "@", which the list above omits. */
+/*   0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F */
+    {0, 8, 0, 0, 8, 8, 0, 8, 8, 8, 8, 12, 8,12,12, 0,    /* 2x   !"#$%&'()*+,-./  */
+     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 8, 8, 0, 8, 0, 0,    /* 3x  0123456789:;<=>?  */
+     8, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1,    /* 4x  @ABCDEFGHIJKLMNO  */
+     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 8,    /* 5X  PQRSTUVWXYZ[\]^_  */
+     0, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1,    /* 6x  `abcdefghijklmno  */
+     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 8, 0};   /* 7X  pqrstuvwxyz{|}~  DEL */
+
+PRIVATE inline BOOL IsSafe(WCHAR ch, WORD mask)
+{
+    // The table only covers 0x20..0x7F; anything outside that range is never safe.
+    if (ch < 32 || ch > 127)
+        return FALSE;
+    return (isSafe[ch - 32] & mask) ? TRUE : FALSE;
+}
+
+PRIVATE inline BOOL IsAsciiCharW(WCHAR ch)
+{   // TRUE when nothing is set above the low byte and the low byte itself is non-zero.
+    return ((ch >> 8) == 0) && (((CHAR) ch) != 0);
+}
+
+BOOL IsValidSchemeCharW(WCHAR ch)
+{
+    // Mask 5 = bit 0 (alphadigit) | bit 2 (valid scheme) in the isSafe table.
+    const WORD schemeMask = 5;
+    return IsAsciiCharW(ch) ? IsSafe((CHAR) ch, schemeMask) : FALSE;
+}
+
+
+
+WCHAR const c_szHttpScheme[]           = W("http");    // scheme names referenced by the table below
+WCHAR const c_szFileScheme[]           = W("file");
+WCHAR const c_szFTPScheme[]            = W("ftp");
+WCHAR const c_szHttpsScheme[]          = W("https");
+// Known schemes: name, URL_SCHEME_* value, name length in characters, UPF_SCHEME_* flags.
+const struct
+{
+    LPCWSTR pszScheme;      // scheme name, without the trailing ':'
+    URL_SCHEME eScheme;     // value SchemeTypeFromStringW returns when this entry matches
+    DWORD cchScheme;        // SIZECHARS(name) - 1; presumably the length without the null terminator
+    DWORD dwFlags;          // combination of UPF_SCHEME_* bits
+} g_mpUrlSchemeTypes[] =
+    {
+    // SchemeTypeFromStringW searches this table linearly from the top,
+    // so keep the entries sorted in the order of most common usage.
+    { c_szHttpScheme,   URL_SCHEME_HTTP,      SIZECHARS(c_szHttpScheme) - 1,     UPF_SCHEME_INTERNET|UPF_SCHEME_CONVERT},
+    { c_szFileScheme,   URL_SCHEME_FILE,      SIZECHARS(c_szFileScheme) - 1,     UPF_SCHEME_CONVERT},
+    { c_szFTPScheme,    URL_SCHEME_FTP,       SIZECHARS(c_szFTPScheme) - 1,      UPF_SCHEME_INTERNET|UPF_SCHEME_CONVERT},
+    { c_szHttpsScheme,  URL_SCHEME_HTTPS,     SIZECHARS(c_szHttpsScheme) -1,     UPF_SCHEME_INTERNET|UPF_SCHEME_CONVERT|UPF_SCHEME_DONTCORRECT},
+    };
+
+
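+/*----------------------------------------------------------
+Purpose: Helpers that map the scheme portion of a URL string to
+         its scheme ordinal type (URL_SCHEME_*).
+         IsSameSchemeW compares a counted scheme against a known scheme name;
+         SchemeTypeFromStringW returns the ordinal of the matching scheme.
+Returns: URL_SCHEME_ ordinal (SchemeTypeFromStringW)
+Cond:    --
+*/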
+
+PRIVATE inline BOOL IsSameSchemeW(LPCWSTR pszLocal, LPCWSTR pszGlobal, DWORD cch)
+{
+    // TRUE when StrCmpNIW reports the first cch characters of both strings as equal.
+    ASSERT(pszLocal);
+    ASSERT(pszGlobal);
+    ASSERT(cch);
+    return StrCmpNIW(pszLocal, pszGlobal, cch) == 0;
+}
+
+
+
+PRIVATE URL_SCHEME
+SchemeTypeFromStringW(
+   LPCWSTR psz,
+   DWORD cch)
+{
+   // Look up the URL_SCHEME_* value for the scheme name held in psz.
+   // psz is a counted buffer of cch characters, not a null-terminated
+   // string, hence IS_VALID_READ_BUFFER rather than IS_VALID_STRING_PTRW.
+   ASSERT(IS_VALID_READ_BUFFER(psz, WCHAR, cch));
+   ASSERT(cch);
+
+   // Walk g_mpUrlSchemeTypes front to back; the table is short and is
+   // ordered by expected frequency, so a linear scan is sufficient.
+   for (DWORD iEntry = 0; iEntry < ARRAYSIZE(g_mpUrlSchemeTypes); ++iEntry)
+   {
+       // Lengths must match exactly before the names are compared.
+       if (cch != g_mpUrlSchemeTypes[iEntry].cchScheme)
+           continue;
+
+       if (IsSameSchemeW(psz, g_mpUrlSchemeTypes[iEntry].pszScheme, cch))
+           return g_mpUrlSchemeTypes[iEntry].eScheme;
+   }
+
+   // No table entry matched.
+   return URL_SCHEME_UNKNOWN;
+}
+
+inline BOOL IsSeparator(const WCHAR *p)
+{   // TRUE when *p is a forward slash (SLASH) or a backslash (WHACK).
+    return (p[0] == WHACK) || (p[0] == SLASH);
+}
+
+PRIVATE inline BOOL IsUrlPrefixW(LPCWSTR psz)
+{
+    //
+    // Hand-rolled test for a leading "url" in either letter case; only
+    // the first three characters are examined. Each character is read
+    // only after the previous one matched, so the reads never go past
+    // a terminating null.
+    //
+    if (psz[0] != L'u' && psz[0] != L'U')
+        return FALSE;
+    if (psz[1] != L'r' && psz[1] != L'R')
+        return FALSE;
+    return (psz[2] == L'l' || psz[2] == L'L') ? TRUE : FALSE;
+    // Stands in for: !StrCmpNIW(psz, c_szURLPrefixW, c_cchURLPrefix)
+}
+
+//  FindSchemeW() - find the scheme at the start of psz, skipping any leading "url" prefix.
+//  Returns a pointer to the scheme and stores its length in *pcchScheme, or returns NULL
+//  when no scheme is found. It is kept around for perf reasons for ParseURL(); any change
+//  to either FindScheme() variant needs to be reflected in the other.
+LPCWSTR FindSchemeW(LPCWSTR psz, LPDWORD pcchScheme, BOOL fAllowSemicolon = FALSE)
+{
+    LPCWSTR pch;
+    DWORD cch;
+    // Both pointers are required; they are only checked by ASSERT.
+    ASSERT(pcchScheme);
+    ASSERT(psz);
+    // NOTE(review): after a skipped "url" prefix this can end up non-zero even when NULL is returned.
+    *pcchScheme = 0;
+    // Scan for the delimiter, giving up on the first character that is not valid in a scheme.
+    for (pch = psz, cch = 0; *pch; pch++, cch++)
+    {
+        // A ':' (or ';' when fAllowSemicolon is set) ends the candidate scheme.
+        if (*pch == L':' ||
+
+            // Autocorrect permits a semicolon typo
+            (fAllowSemicolon && *pch == L';'))
+        {
+            if (IsUrlPrefixW(psz))
+            {
+                psz = pch +1;
+                // NOTE(review): IsUrlPrefixW only checks three characters, so any leading "url..." run is skipped here.
+                //  record the length skipped so far (prefix + delimiter); overwritten below if a scheme is found
+                *pcchScheme = cch + 1;
+
+                //  (DWORD)-1 so that the loop's cch++ restarts the scheme length at 0
+                cch = (DWORD) -1;
+                continue;
+            }
+            else
+            {
+                //
+                //  Scheme found if it is at least two characters; otherwise stop and fail
+                if(cch > 1)
+                {
+                    *pcchScheme = cch;
+                    return psz;
+                }
+                break;
+            }
+        }
+        if(!IsValidSchemeCharW(*pch))
+            break;
+    }
+    // No delimiter, an invalid scheme character, or a scheme shorter than two characters.
+    return NULL;
+}
+
+PRIVATE DWORD
+CountSlashes(LPCWSTR *ppsz)
+{
+    // Counts the run of separators starting at *ppsz. When the run is
+    // non-empty, *ppsz is left pointing at its last separator; otherwise
+    // *ppsz is not modified.
+    LPCWSTR pszStart = *ppsz;
+    DWORD cRun = 0;
+
+    for (; IsSeparator(pszStart + cRun); cRun++)
+    {
+        *ppsz = pszStart + cRun;
+    }
+    return cRun;
+}
+
+/*----------------------------------------------------------
+Purpose: Parse the given path into the PARSEDURL structure.
+         On success ppu receives pszProtocol/cchProtocol, nScheme,
+         and pszSuffix/cchSuffix; both pointers refer into pcszURL.
+  ******
+  ******  This function must not do any extraneous
+  ******  things.  It must be small and fast.
+  ******
+Returns: S_OK (NOERROR) if a valid URL format
+         URL_E_INVALID_SYNTAX if no scheme is found
+         E_INVALIDARG if pcszURL or ppu is NULL, or ppu->cbSize
+         does not equal SIZEOF(*ppu)
+*/
+STDMETHODIMP
+ParseURLW(
+          LPCWSTR pcszURL,
+          PPARSEDURLW ppu)
+{
+    HRESULT hr = E_INVALIDARG;
+
+    RIP(IS_VALID_STRING_PTRW(pcszURL, -1));
+    RIP(IS_VALID_WRITE_PTR(ppu, PARSEDURLW));
+    // The caller must set ppu->cbSize to the structure size; otherwise hr stays E_INVALIDARG.
+    if (pcszURL && ppu && SIZEOF(*ppu) == ppu->cbSize)
+    {
+        DWORD cch;
+        hr = URL_E_INVALID_SYNTAX;      // assume error
+        // FindSchemeW returns NULL when no scheme is present, which leaves hr at URL_E_INVALID_SYNTAX.
+        ppu->pszProtocol = FindSchemeW(pcszURL, &cch);
+
+        if(ppu->pszProtocol)
+        {
+            ppu->cchProtocol = cch;
+
+            // Determine protocol scheme number
+            ppu->nScheme = SchemeTypeFromStringW(ppu->pszProtocol, cch);
+            // The suffix starts just past the scheme and its one-character delimiter.
+            ppu->pszSuffix = ppu->pszProtocol + cch + 1;
+
+            //
+            //  APPCOMPAT - Backwards compatibility.
+            //  ParseURL() believes in file: urls like "file://C:\foo\bar"
+            //  and some pieces of code will use it to get the Dos Path.
+            //  New code should always call PathCreateFromUrl() to
+            //  get the dos path of a file: URL.
+            //
+            //  This behavior is left in place in case some compat stuff is out there.
+            //
+            if (URL_SCHEME_FILE == ppu->nScheme &&
+                '/' == ppu->pszSuffix[0] && '/' == ppu->pszSuffix[1])
+            {
+                // Yes; skip the "//"
+                ppu->pszSuffix += 2;
+
+#ifndef PLATFORM_UNIX
+                // There might be a third slash.  Skip it.
+                // IEUNIX - On UNIX, it's a root directory, so don't skip it!
+                if ('/' == *ppu->pszSuffix)
+                    ppu->pszSuffix++;
+#endif
+            }
+
+            ppu->cchSuffix = lstrlenW(ppu->pszSuffix);
+
+            hr = S_OK;
+        }
+    }
+
+
+#ifdef DEBUG
+    if (hr==S_OK)
+    {
+        WCHAR rgchDebugProtocol[MAX_PATH_FNAME];
+        WCHAR rgchDebugSuffix[MAX_PATH_FNAME];
+        // DEBUG-only copies of the parsed pieces; the buffers are not read again in this function.
+        // (+ 1) for null terminator.
+
+        StrCpyNW(rgchDebugProtocol, ppu->pszProtocol,
+            min(ppu->cchProtocol + 1, SIZECHARS(rgchDebugProtocol)));
+
+        // (+ 1) for null terminator.
+
+        StrCpyNW(rgchDebugSuffix, ppu->pszSuffix,
+            min(ppu->cchSuffix + 1, SIZECHARS(rgchDebugSuffix)));
+
+    }
+#endif
+
+    return(hr);
+}
+
+STDAPI_(BOOL) PathIsURLW(IN LPCWSTR pszPath)
+{
+    // A path counts as a URL when ParseURLW succeeds on it; NULL is never a URL.
+    if (pszPath == NULL)
+        return FALSE;
+    RIPMSG(IS_VALID_STRING_PTR(pszPath, -1), "PathIsURL: caller passed bad pszPath");
+
+    PARSEDURLW parsed;
+    parsed.cbSize = SIZEOF(parsed);     // ParseURLW rejects any other cbSize
+    const HRESULT hrParse = ParseURLW(pszPath, &parsed);
+    return SUCCEEDED(hrParse);
+}