// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
//

//
// ===========================================================================
// File: urlpars.cpp
//
// URL APIs ported from shlwapi (especially for Fusion)
// ===========================================================================

#include "common.h"
#include "strsafe.h"

// The two path separator characters.  IsSeparator() accepts either one.
#define SLASH       W('/')
#define WHACK       W('\\')

// Per-scheme flag bits.  They are combined into the dwFlags member of the
// g_mpUrlSchemeTypes[] entries.
#define UPF_SCHEME_OPAQUE           0x00000001  //  should not be treated as hierarchical
#define UPF_SCHEME_INTERNET         0x00000002
#define UPF_SCHEME_NOHISTORY        0x00000004
#define UPF_SCHEME_CONVERT          0x00000008  //  treat slashes and whacks as equiv
#define UPF_SCHEME_DONTCORRECT      0x00000010  //  Don't try to autocorrect to this scheme

// Character-class lookup table covering character codes 32 through 127.
// Entry i describes character code (i + 32); IsSafe() does the range check
// and the lookup.  Each entry is a bit mask built from the classes below.
PRIVATE CONST WORD isSafe[96] =

/*   Bit 0       alphadigit     -- 'a' to 'z', '0' to '9', 'A' to 'Z'
**   Bit 1       Hex            -- '0' to '9', 'a' to 'f', 'A' to 'F'
**   Bit 2       valid scheme   -- alphadigit | "-" | "." | "+"
**   Bit 3       mark           -- "%" | "$"| "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")" | ","
**
**   Notes on the data as written:
**     - Bit 2 (value 4) is set only for "+", "-" and ".".  Alphadigits carry
**       bit 0 instead, so a "valid scheme character" test uses mask 5
**       (bit 0 | bit 2), as IsValidSchemeCharW() does.
**     - Bit 3 (value 8) is also set for "+", ":", ";", "=" and "@", which
**       the "mark" list above does not mention.
*/
/*   0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F */
    {0, 8, 0, 0, 8, 8, 0, 8, 8, 8, 8, 12, 8,12,12, 0,    /* 2x   !"#$%&'()*+,-./  */
     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 8, 8, 0, 8, 0, 0,    /* 3x  0123456789:;<=>?  */
     8, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1,    /* 4x  @ABCDEFGHIJKLMNO  */
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 8,    /* 5X  PQRSTUVWXYZ[\]^_  */
     0, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1,    /* 6x  `abcdefghijklmno  */
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 8, 0};   /* 7X  pqrstuvwxyz{|}~  DEL */

// Tests a character against the isSafe[] class table.
//
//   ch   - character to classify
//   mask - one or more class bits (see the table's bit legend)
//
// Returns TRUE when ch is in the table's range (32..127) and its entry has
// at least one bit in common with mask; FALSE otherwise.
PRIVATE inline BOOL IsSafe(WCHAR ch, WORD mask)
{
    // Characters outside 32..127 have no table entry.
    if (ch <= 31 || ch >= 128)
        return FALSE;

    return (isSafe[ch - 32] & mask) ? TRUE : FALSE;
}

// Returns nonzero when every bit of ch above the low 8 is clear and the low
// byte is nonzero, i.e. ch is in the range 0x01..0xFF.  Despite the name this
// is wider than 7-bit ASCII.
PRIVATE inline BOOL IsAsciiCharW(WCHAR ch)
{
    // Anything with bits above the low byte is rejected outright.
    if ((ch >> 8) != 0)
        return FALSE;

    // Reject the null character.
    return ((CHAR) ch) != 0;
}

// Returns TRUE when ch may appear in a URL scheme name.
//
// The character must first pass IsAsciiCharW(); it is then looked up in the
// isSafe[] table with mask 5 (bit 0 | bit 2), which per the table data selects
// letters, digits, "+", "-" and ".".
BOOL IsValidSchemeCharW(WCHAR ch)
{
    if (!IsAsciiCharW(ch))
        return FALSE;

    return IsSafe( (CHAR) ch, 5);
}


// Scheme names (without the trailing ':') referenced by the
// g_mpUrlSchemeTypes[] table.
WCHAR const c_szHttpScheme[]           = W("http");
WCHAR const c_szFileScheme[]           = W("file");
WCHAR const c_szFTPScheme[]            = W("ftp");
WCHAR const c_szHttpsScheme[]          = W("https");

// Table of the schemes this file recognizes.  SchemeTypeFromStringW() scans it
// from the first entry to the last and returns the eScheme of the first match.
const struct
{
    LPCWSTR pszScheme;      // scheme name, without the ':'
    URL_SCHEME eScheme;     // URL_SCHEME_* ordinal reported for this scheme
    DWORD cchScheme;        // value compared against the candidate's length;
                            // presumably the name length without terminator
                            // (SIZECHARS - 1) -- confirm against SIZECHARS
    DWORD dwFlags;          // combination of UPF_SCHEME_* bits
} g_mpUrlSchemeTypes[] =
    {
    // Because we use a linear search, sort this in the order of
    // most common usage.
    { c_szHttpScheme,   URL_SCHEME_HTTP,      SIZECHARS(c_szHttpScheme) - 1,     UPF_SCHEME_INTERNET|UPF_SCHEME_CONVERT},
    { c_szFileScheme,   URL_SCHEME_FILE,      SIZECHARS(c_szFileScheme) - 1,     UPF_SCHEME_CONVERT},
    { c_szFTPScheme,    URL_SCHEME_FTP,       SIZECHARS(c_szFTPScheme) - 1,      UPF_SCHEME_INTERNET|UPF_SCHEME_CONVERT},
    { c_szHttpsScheme,  URL_SCHEME_HTTPS,     SIZECHARS(c_szHttpsScheme) -1,     UPF_SCHEME_INTERNET|UPF_SCHEME_CONVERT|UPF_SCHEME_DONTCORRECT},
    };


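/*----------------------------------------------------------
Purpose: Return the scheme ordinal type (URL_SCHEME_*) based on the
         scheme portion of a URL string.  This describes
         SchemeTypeFromStringW() below; IsSameSchemeW() is the
         comparison helper it uses.

Returns: URL_SCHEME_ ordinal
Cond:    --
*/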

// Compares the first cch characters of two scheme strings with StrCmpNIW().
//
//   pszLocal  - candidate scheme text (counted by cch)
//   pszGlobal - scheme name to compare against
//   cch       - number of characters to compare; must be nonzero
//
// Returns nonzero when StrCmpNIW() reports the two as equal.
PRIVATE inline BOOL IsSameSchemeW(LPCWSTR pszLocal, LPCWSTR pszGlobal, DWORD cch)
{
    ASSERT(pszLocal);
    ASSERT(pszGlobal);
    ASSERT(cch);

    const int nCompare = StrCmpNIW(pszLocal, pszGlobal, cch);

    return (nCompare == 0);
}


// Maps a counted scheme string to its URL_SCHEME_* ordinal.
//
//   psz - scheme text; counted by cch, NOT necessarily null-terminated
//   cch - number of characters in psz; must be nonzero
//
// Returns the eScheme of the first g_mpUrlSchemeTypes[] entry whose length
// equals cch and whose name matches per IsSameSchemeW(), or
// URL_SCHEME_UNKNOWN when no entry matches.
PRIVATE URL_SCHEME
SchemeTypeFromStringW(
   LPCWSTR psz,
   DWORD cch)
{
    // psz is a counted buffer, so validate it as a read buffer rather than
    // as a null-terminated string.
    ASSERT(IS_VALID_READ_BUFFER(psz, WCHAR, cch));
    ASSERT(cch);

    // Linear scan: the table is tiny and is ordered by expected frequency,
    // so the common schemes are found on the first probe or two.
    for (DWORD iEntry = 0; iEntry < ARRAYSIZE(g_mpUrlSchemeTypes); iEntry++)
    {
        // Cheap length test first; skip the string compare on a mismatch.
        if (g_mpUrlSchemeTypes[iEntry].cchScheme != cch)
            continue;

        if (IsSameSchemeW(psz, g_mpUrlSchemeTypes[iEntry].pszScheme, cch))
            return g_mpUrlSchemeTypes[iEntry].eScheme;
    }

    return URL_SCHEME_UNKNOWN;
}

// Returns nonzero when the character p points at is a forward slash (SLASH)
// or a backslash (WHACK).  p must point at a readable character.
inline BOOL IsSeparator(const WCHAR *p)
{
    const WCHAR chCur = *p;

    if (chCur == SLASH)
        return TRUE;

    return (chCur == WHACK);
}

// Returns TRUE when psz begins with the three letters "url" in any mix of
// upper and lower case.
//
// Only the first three characters are examined; whatever follows them is not
// looked at.  Characters are tested left to right and the test stops at the
// first mismatch, so a string shorter than three characters is never read
// past its terminator.
PRIVATE inline BOOL IsUrlPrefixW(LPCWSTR psz)
{
    // Hand-unrolled case-insensitive compare against "url".
    if (psz[0] != L'u' && psz[0] != L'U')
        return FALSE;

    if (psz[1] != L'r' && psz[1] != L'R')
        return FALSE;

    if (psz[2] != L'l' && psz[2] != L'L')
        return FALSE;

    return TRUE;
}

//
//  FindSchemeW() is kept around for perf reasons for ParseURL().
//  Any change in either FindScheme() needs to be reflected in the other.
//
// Locates the scheme at the start of a URL string.
//
//   psz             - null-terminated string to scan
//   pcchScheme      - receives the scheme length in characters (without the
//                     delimiter) on success.  It is set to 0 on entry; if a
//                     "URL:" prefix was skipped and no scheme is then found,
//                     it is left holding the length consumed by that prefix.
//   fAllowSemicolon - when TRUE, ';' is accepted as the delimiter as well as
//                     ':' (autocorrect permits a semicolon typo)
//
// Returns a pointer to the first character of the scheme (inside psz), or
// NULL when no scheme is found.  A scheme is a run of at least two valid
// scheme characters followed by the delimiter.
//
// A leading "URL:" prefix is skipped and the search restarts after it.
// NOTE(review): the prefix test is IsUrlPrefixW(), which looks only at the
// first three characters, so any scheme that merely starts with "url" is
// consumed as a prefix here -- confirm this is intended.
LPCWSTR FindSchemeW(LPCWSTR psz, LPDWORD pcchScheme, BOOL fAllowSemicolon = FALSE)
{
    ASSERT(pcchScheme);
    ASSERT(psz);

    *pcchScheme = 0;

    LPCWSTR pszCur = psz;   // current scan position
    DWORD cchSeen = 0;      // characters seen since the current scheme start

    while (*pszCur)
    {
        const BOOL fDelimiter =
            (*pszCur == L':') || (fAllowSemicolon && *pszCur == L';');

        if (fDelimiter)
        {
            if (!IsUrlPrefixW(psz))
            {
                // One-character "schemes" are rejected.
                if (cchSeen > 1)
                {
                    *pcchScheme = cchSeen;
                    return psz;
                }
                return NULL;
            }

            // Skip past "URL:" and restart the scheme scan right after it.
            psz = pszCur + 1;
            *pcchScheme = cchSeen + 1;

            pszCur++;
            cchSeen = 0;
            continue;
        }

        // Any character that cannot be part of a scheme ends the search.
        if (!IsValidSchemeCharW(*pszCur))
            return NULL;

        pszCur++;
        cchSeen++;
    }

    return NULL;
}

// Counts the run of separator characters (per IsSeparator()) that starts at
// *ppsz.
//
//   ppsz - in: start of the text to examine.
//          out: points at the LAST separator of the run (not past it); left
//          unchanged when the text does not start with a separator.
//
// Returns the number of consecutive separators found.
PRIVATE DWORD
CountSlashes(LPCWSTR *ppsz)
{
    DWORD cFound = 0;

    for (LPCWSTR pszScan = *ppsz; IsSeparator(pszScan); pszScan++)
    {
        // Remember the most recent separator for the caller.
        *ppsz = pszScan;
        cFound++;
    }

    return cFound;
}

/*----------------------------------------------------------
Purpose: Parse the given URL string into the PARSEDURLW structure.

  ******
  ******  This function must not do any extraneous
  ******  things.  It must be small and fast.
  ******

Params:  pcszURL - null-terminated URL string to parse.
         ppu     - receives the result.  The caller must set
                   ppu->cbSize to the size of the structure
                   before calling.

         On success these members are filled in.  Both pointers
         point into pcszURL; nothing is copied.
           pszProtocol / cchProtocol - scheme name, without the ':'
           nScheme                   - URL_SCHEME_* ordinal
           pszSuffix / cchSuffix     - text following the ':'

Returns: S_OK if a valid URL format
         URL_E_INVALID_SYNTAX if no scheme was found (in that case
             ppu->pszProtocol has been set to NULL)
         E_INVALIDARG if an argument is NULL or ppu->cbSize does
             not match the structure size

Cond:    --
*/
STDMETHODIMP
ParseURLW(
          LPCWSTR pcszURL,
          PPARSEDURLW ppu)
{
    HRESULT hr = E_INVALIDARG;

    RIP(IS_VALID_STRING_PTRW(pcszURL, -1));
    RIP(IS_VALID_WRITE_PTR(ppu, PARSEDURLW));

    if (pcszURL && ppu && SIZEOF(*ppu) == ppu->cbSize)
    {
        DWORD cch;
        hr = URL_E_INVALID_SYNTAX;      // assume error

        ppu->pszProtocol = FindSchemeW(pcszURL, &cch);

        if(ppu->pszProtocol)
        {
            ppu->cchProtocol = cch;

            // Determine protocol scheme number
            ppu->nScheme = SchemeTypeFromStringW(ppu->pszProtocol, cch);

            // The suffix starts right after the scheme delimiter.
            ppu->pszSuffix = ppu->pszProtocol + cch + 1;

            //
            //  APPCOMPAT - Backwards compatibility.  
            //  ParseURL() believes in file: urls like "file://C:\foo\bar"
            //  and some pieces of code will use it to get the Dos Path.
            //  new code should always call PathCreateFromUrl() to
            //  get the dos path of a file: URL.
            //
            //  i am leaving this behavior in case some compat stuff is out there.
            //
            if (URL_SCHEME_FILE == ppu->nScheme &&
                '/' == ppu->pszSuffix[0] && '/' == ppu->pszSuffix[1])
            {
                // Yes; skip the "//"
                ppu->pszSuffix += 2;

#ifndef PLATFORM_UNIX
                // There might be a third slash.  Skip it.
                // IEUNIX - On UNIX, it's a root directory, so don't skip it!
                if ('/' == *ppu->pszSuffix)
                    ppu->pszSuffix++;
#endif
            }

            ppu->cchSuffix = lstrlenW(ppu->pszSuffix);

            hr = S_OK;
        }
    }


#ifdef DEBUG
    if (hr==S_OK)
    {
        // Debug-only, null-terminated copies of the two pieces so they can be
        // inspected in a debugger.  They are not used otherwise.
        WCHAR rgchDebugProtocol[MAX_PATH_FNAME];
        WCHAR rgchDebugSuffix[MAX_PATH_FNAME];

        // StringCchCopyNW's last argument is the number of characters to copy
        // and does NOT include the null terminator.  Pass the piece's length,
        // capped at (buffer size - 1) so the terminator always fits.  Passing
        // length + 1 would copy the ':' delimiter into the protocol buffer.

        StringCchCopyNW(rgchDebugProtocol, ARRAYSIZE(rgchDebugProtocol), ppu->pszProtocol,
            min(ppu->cchProtocol, SIZECHARS(rgchDebugProtocol) - 1));

        StringCchCopyNW(rgchDebugSuffix, ARRAYSIZE(rgchDebugSuffix), ppu->pszSuffix,
            min(ppu->cchSuffix, SIZECHARS(rgchDebugSuffix) - 1));
    }
#endif

    return(hr);
}

// Reports whether pszPath parses as a URL.
//
//   pszPath - null-terminated string to test; may be NULL
//
// Returns FALSE for a NULL pszPath; otherwise returns whether ParseURLW()
// succeeded on it.
STDAPI_(BOOL) PathIsURLW(IN LPCWSTR pszPath)
{
    if (!pszPath)
        return FALSE;

    RIPMSG(IS_VALID_STRING_PTR(pszPath, -1), "PathIsURL: caller passed bad pszPath");

    // ParseURLW() checks cbSize, so it must be set before the call.  The
    // parsed pieces themselves are not needed here.
    PARSEDURLW puParsed;
    puParsed.cbSize = SIZEOF(puParsed);

    const HRESULT hrParse = ParseURLW(pszPath, &puParsed);

    return SUCCEEDED(hrParse);
}