summaryrefslogtreecommitdiff
path: root/src/palrt/urlpars.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/palrt/urlpars.cpp')
-rw-r--r--src/palrt/urlpars.cpp321
1 files changed, 321 insertions, 0 deletions
diff --git a/src/palrt/urlpars.cpp b/src/palrt/urlpars.cpp
new file mode 100644
index 0000000000..77b0a6e2ba
--- /dev/null
+++ b/src/palrt/urlpars.cpp
@@ -0,0 +1,321 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+//
+
+//
+// ===========================================================================
+// File: urlpars.cpp
+//
+// URL APIs ported from shlwapi (especially for Fusion)
+// ===========================================================================
+
+#include "common.h"
+
+#define SLASH W('/') // forward slash; presumably W() produces a wide-character literal
+#define WHACK W('\\') // backslash; IsSeparator() accepts either SLASH or WHACK
+
+#define UPF_SCHEME_OPAQUE 0x00000001 // should not be treated as hierarchical
+#define UPF_SCHEME_INTERNET 0x00000002 // set on the http, ftp and https entries of g_mpUrlSchemeTypes
+#define UPF_SCHEME_NOHISTORY 0x00000004 // not referenced anywhere else in this file
+#define UPF_SCHEME_CONVERT 0x00000008 // treat slashes and whacks as equiv
+#define UPF_SCHEME_DONTCORRECT 0x00000010 // Don't try to autocorrect to this scheme
+
+PRIVATE CONST WORD isSafe[96] =
+
+/* Character-class table for the 96 code points 0x20..0x7F.
+** IsSafe() indexes it with (ch - 32) and ANDs the entry with a mask.
+**
+** Bit 0 (mask 1) alphadigit -- 'a' to 'z', '0' to '9', 'A' to 'Z'
+** Bit 1 (mask 2) Hex -- '0' to '9', 'a' to 'f', 'A' to 'F'
+** Bit 2 (mask 4) valid scheme -- alphadigit | "-" | "." | "+"
+** Bit 3 (mask 8) mark -- "%" | "$"| "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")" | ","
+**
+** Notes on the values as stored:
+** - Letters and digits carry only bits 0/1; bit 2 is stored only on
+** "+", "-" and ".", so a scheme test has to ask for bits 0 and 2
+** together (mask 5, as IsValidSchemeCharW does).
+** - Bit 3 is also set on "+", ":", ";", "=" and "@", which the mark
+** list above does not mention.
+*/
+/* 0 1 2 3 4 5 6 7 8 9 A B C D E F */
+ {0, 8, 0, 0, 8, 8, 0, 8, 8, 8, 8, 12, 8,12,12, 0, /* 2x !"#$%&'()*+,-./ */
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 8, 8, 0, 8, 0, 0, /* 3x 0123456789:;<=>? */
+ 8, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 4x @ABCDEFGHIJKLMNO */
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 8, /* 5X PQRSTUVWXYZ[\]^_ */
+ 0, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 6x `abcdefghijklmno */
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 8, 0}; /* 7X pqrstuvwxyz{|}~ DEL */
+
+// Returns TRUE when ch falls inside the range covered by the isSafe table
+// (32..127) and its table entry has at least one bit in common with mask;
+// FALSE otherwise.
+PRIVATE inline BOOL IsSafe(WCHAR ch, WORD mask)
+{
+    // Code points outside 32..127 have no entry in the table.
+    if (ch <= 31 || ch >= 128)
+        return FALSE;
+
+    return (isSafe[ch - 32] & mask) ? TRUE : FALSE;
+}
+
+// Returns TRUE when ch is a non-NUL 7-bit ASCII character (0x01..0x7F).
+// NUL is excluded so that a string terminator is never reported as a
+// character; code units 0x80 and above are not ASCII and yield FALSE.
+PRIVATE inline BOOL IsAsciiCharW(WCHAR ch)
+{
+    return (ch != 0 && ch < 128);
+}
+
+// Returns TRUE when ch may appear inside a URL scheme name.
+// Mask 5 selects bit 0 (alphadigit) and bit 2 (valid scheme) of the isSafe
+// table; a character qualifies when either bit is set for it.
+BOOL IsValidSchemeCharW(WCHAR ch)
+{
+    return IsAsciiCharW(ch) ? IsSafe( (CHAR) ch, 5) : FALSE;
+}
+
+
+
+WCHAR const c_szHttpScheme[] = W("http");
+WCHAR const c_szFileScheme[] = W("file");
+WCHAR const c_szFTPScheme[] = W("ftp");
+WCHAR const c_szHttpsScheme[] = W("https");
+
+const struct
+{
+ LPCWSTR pszScheme; // scheme name, without the ':' delimiter
+ URL_SCHEME eScheme; // URL_SCHEME_* value SchemeTypeFromStringW returns for this name
+ DWORD cchScheme; // SIZECHARS(name) - 1; presumably the name length without its terminator
+ DWORD dwFlags; // combination of UPF_SCHEME_* flags
+} g_mpUrlSchemeTypes[] =
+ {
+ // Because we use a linear search, sort this in the order of
+ // most common usage.
+ { c_szHttpScheme, URL_SCHEME_HTTP, SIZECHARS(c_szHttpScheme) - 1, UPF_SCHEME_INTERNET|UPF_SCHEME_CONVERT},
+ { c_szFileScheme, URL_SCHEME_FILE, SIZECHARS(c_szFileScheme) - 1, UPF_SCHEME_CONVERT},
+ { c_szFTPScheme, URL_SCHEME_FTP, SIZECHARS(c_szFTPScheme) - 1, UPF_SCHEME_INTERNET|UPF_SCHEME_CONVERT},
+ { c_szHttpsScheme, URL_SCHEME_HTTPS, SIZECHARS(c_szHttpsScheme) -1, UPF_SCHEME_INTERNET|UPF_SCHEME_CONVERT|UPF_SCHEME_DONTCORRECT},
+ };
+
+
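+/*----------------------------------------------------------
+Purpose: SchemeTypeFromStringW (defined below, after the IsSameSchemeW
+ helper) returns the scheme ordinal type (URL_SCHEME_*) for a
+ counted scheme string.
+
+
+Returns: URL_SCHEME_ ordinal; URL_SCHEME_UNKNOWN if the scheme is not in
+ g_mpUrlSchemeTypes
+Cond: --
+*/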
+
+// Compares the first cch characters of a candidate scheme (pszLocal) with a
+// known scheme name (pszGlobal) using StrCmpNIW, the length-limited,
+// case-insensitive compare.  Returns TRUE when StrCmpNIW reports equality.
+// All three arguments are asserted to be non-zero.
+PRIVATE inline BOOL IsSameSchemeW(LPCWSTR pszLocal, LPCWSTR pszGlobal, DWORD cch)
+{
+    ASSERT(pszLocal);
+    ASSERT(pszGlobal);
+    ASSERT(cch);
+
+    return (StrCmpNIW(pszLocal, pszGlobal, cch) == 0);
+}
+
+
+
+// Maps a counted scheme string to its URL_SCHEME_* ordinal.
+//
+// psz - start of the scheme text; counted by cch, NOT null-terminated,
+//       hence IS_VALID_READ_BUFFER rather than IS_VALID_STRING_PTRW
+// cch - number of characters in the scheme; asserted non-zero
+//
+// Returns the eScheme of the first g_mpUrlSchemeTypes entry whose length
+// and text match, or URL_SCHEME_UNKNOWN when none does.
+PRIVATE URL_SCHEME
+SchemeTypeFromStringW(
+    LPCWSTR psz,
+    DWORD cch)
+{
+    ASSERT(IS_VALID_READ_BUFFER(psz, WCHAR, cch));
+    ASSERT(cch);
+
+    // Linear scan: the table is too small for a binary search to pay off,
+    // and it is ordered with the most popular schemes first.
+    for (DWORD iEntry = 0; iEntry < ARRAYSIZE(g_mpUrlSchemeTypes); ++iEntry)
+    {
+        // Length check first; the text compare only runs on equal lengths.
+        if (cch != g_mpUrlSchemeTypes[iEntry].cchScheme)
+            continue;
+
+        if (IsSameSchemeW(psz, g_mpUrlSchemeTypes[iEntry].pszScheme, cch))
+            return g_mpUrlSchemeTypes[iEntry].eScheme;
+    }
+
+    return URL_SCHEME_UNKNOWN;
+}
+
+// TRUE when the character p points at is a forward slash (SLASH) or a
+// backslash (WHACK).
+inline BOOL IsSeparator(const WCHAR *p)
+{
+    const WCHAR chCur = *p;
+
+    return (chCur == SLASH || chCur == WHACK);
+}
+
+// Returns TRUE when psz begins with the letters "url" in any letter case.
+// Only those three letters are examined; whatever follows them (for
+// example a ':') is not checked here.
+//
+// Hand-written instead of a StrCmpNIW call because this is on a hot path.
+PRIVATE inline BOOL IsUrlPrefixW(LPCWSTR psz)
+{
+    // && short-circuits, so a later character is read only after the
+    // earlier ones matched and therefore were not the terminator.
+    return ((psz[0] == L'u' || psz[0] == L'U') &&
+            (psz[1] == L'r' || psz[1] == L'R') &&
+            (psz[2] == L'l' || psz[2] == L'L'));
+}
+
+//
+// FindSchemeW - locate the scheme at the start of a URL string.
+//
+// Scans psz for the scheme delimiter ':' (or ';' as well when
+// fAllowSemicolon is set).  A leading "URL:" wrapper, in any letter case,
+// is skipped and the scan restarts right after it.  The scan gives up at
+// the first character that is not a valid scheme character.
+//
+// psz             - null-terminated string to scan
+// pcchScheme      - zeroed on entry; on success receives the length of the
+//                   scheme in characters.  When a "URL:" wrapper was skipped
+//                   but no scheme followed, it is left holding the length of
+//                   the last wrapper skipped, delimiter included.
+// fAllowSemicolon - also accept ';' as the delimiter (autocorrect of a typo)
+//
+// Returns a pointer to the first character of the scheme, or NULL when no
+// scheme of at least two characters was found.
+//
+// Kept as a separate routine for ParseURL() perf reasons; if another
+// FindScheme variant exists, changes here need to be reflected there too.
+//
+LPCWSTR FindSchemeW(LPCWSTR psz, LPDWORD pcchScheme, BOOL fAllowSemicolon = FALSE)
+{
+    LPCWSTR pch;
+    DWORD cch;
+
+    ASSERT(pcchScheme);
+    ASSERT(psz);
+
+    *pcchScheme = 0;
+
+    for (pch = psz, cch = 0; *pch; pch++, cch++)
+    {
+        if (*pch == L':' ||
+            // Autocorrect permits a semicolon typo
+            (fAllowSemicolon && *pch == L';'))
+        {
+            // Only a candidate that is exactly the three letters "url" is
+            // the wrapper.  Without the length test, any scheme that merely
+            // starts with "url" would be thrown away as if it were one.
+            if (cch == 3 && IsUrlPrefixW(psz))
+            {
+                psz = pch + 1;
+
+                // set pcchScheme to skip past "URL:"
+                *pcchScheme = cch + 1;
+
+                // reset cch for the scheme len; the loop increment brings
+                // it back to 0 for the character after the delimiter
+                cch = (DWORD) -1;
+                continue;
+            }
+
+            // Scheme found if it is at least two characters
+            if (cch > 1)
+            {
+                *pcchScheme = cch;
+                return psz;
+            }
+            break;
+        }
+
+        if (!IsValidSchemeCharW(*pch))
+            break;
+    }
+
+    return NULL;
+}
+
+// Counts the run of separators (slashes or backslashes) starting at *ppsz
+// and returns that count.
+//
+// Side effect: when at least one separator is found, *ppsz is moved to the
+// LAST separator of the run (not past it).  With no separator at *ppsz the
+// pointer is left untouched and 0 is returned.
+PRIVATE DWORD
+CountSlashes(LPCWSTR *ppsz)
+{
+    DWORD cFound = 0;
+
+    for (LPCWSTR pchScan = *ppsz; IsSeparator(pchScan); ++pchScan)
+    {
+        // Remember the separator just seen.
+        *ppsz = pchScan;
+        ++cFound;
+    }
+
+    return cFound;
+}
+
+/*----------------------------------------------------------
+Purpose: Split a URL string into scheme and suffix and record both in
+         the caller's PARSEDURLW structure.
+
+         ******
+         ****** This function must not do any extraneous
+         ****** things. It must be small and fast.
+         ******
+
+Params:  pcszURL - null-terminated URL text
+         ppu     - structure to fill in; its cbSize member must equal
+                   SIZEOF(PARSEDURLW) on entry
+
+Returns: NOERROR (S_OK) if a valid URL format
+         URL_E_INVALID_SYNTAX if no scheme was found
+         E_INVALIDARG if an argument is NULL or cbSize does not match
+
+Cond:    pszProtocol and pszSuffix point into pcszURL; nothing is copied.
+         On URL_E_INVALID_SYNTAX only pszProtocol has been written (NULL).
+*/
+STDMETHODIMP
+ParseURLW(
+    LPCWSTR pcszURL,
+    PPARSEDURLW ppu)
+{
+    RIP(IS_VALID_STRING_PTRW(pcszURL, -1));
+    RIP(IS_VALID_WRITE_PTR(ppu, PARSEDURLW));
+
+    // Refuse missing arguments and structures with the wrong size stamp.
+    if (!pcszURL || !ppu || SIZEOF(*ppu) != ppu->cbSize)
+        return E_INVALIDARG;
+
+    DWORD cchScheme;
+
+    ppu->pszProtocol = FindSchemeW(pcszURL, &cchScheme);
+    if (!ppu->pszProtocol)
+        return URL_E_INVALID_SYNTAX;
+
+    ppu->cchProtocol = cchScheme;
+
+    // Determine protocol scheme number
+    ppu->nScheme = SchemeTypeFromStringW(ppu->pszProtocol, cchScheme);
+
+    // The suffix begins right after the scheme and its one-character delimiter.
+    LPCWSTR pszRest = ppu->pszProtocol + cchScheme + 1;
+
+    //
+    // APPCOMPAT - Backwards compatibility.
+    // ParseURL() believes in file: urls like "file://C:\foo\bar"
+    // and some pieces of code will use it to get the Dos Path.
+    // New code should always call PathCreateFromUrl() to
+    // get the dos path of a file: URL.
+    //
+    // This behavior is kept in case some compat stuff is out there.
+    //
+    if (URL_SCHEME_FILE == ppu->nScheme &&
+        '/' == pszRest[0] && '/' == pszRest[1])
+    {
+        // Yes; skip the "//"
+        pszRest += 2;
+
+#ifndef PLATFORM_UNIX
+        // There might be a third slash. Skip it.
+        // IEUNIX - On UNIX, it's a root directory, so don't skip it!
+        if ('/' == *pszRest)
+            pszRest++;
+#endif
+    }
+
+    ppu->pszSuffix = pszRest;
+    ppu->cchSuffix = lstrlenW(pszRest);
+
+#ifdef DEBUG
+    {
+        // Local copies of the two parts; they are not used after the copy.
+        WCHAR rgchDebugProtocol[MAX_PATH_FNAME];
+        WCHAR rgchDebugSuffix[MAX_PATH_FNAME];
+
+        // (+ 1) for null terminator.
+        StrCpyNW(rgchDebugProtocol, ppu->pszProtocol,
+                 min(ppu->cchProtocol + 1, SIZECHARS(rgchDebugProtocol)));
+
+        // (+ 1) for null terminator.
+        StrCpyNW(rgchDebugSuffix, ppu->pszSuffix,
+                 min(ppu->cchSuffix + 1, SIZECHARS(rgchDebugSuffix)));
+    }
+#endif
+
+    return S_OK;
+}
+
+// Returns TRUE when ParseURLW succeeds on pszPath, i.e. the string starts
+// with a recognizable scheme; FALSE for a NULL pointer or on any failure.
+STDAPI_(BOOL) PathIsURLW(IN LPCWSTR pszPath)
+{
+    if (pszPath)
+    {
+        RIPMSG(IS_VALID_STRING_PTR(pszPath, -1), "PathIsURL: caller passed bad pszPath");
+
+        // ParseURLW rejects the structure unless cbSize is stamped first.
+        PARSEDURLW puScratch;
+        puScratch.cbSize = SIZEOF(puScratch);
+
+        return SUCCEEDED(ParseURLW(pszPath, &puScratch));
+    }
+
+    return FALSE;
+}