// // Copyright (c) Microsoft. All rights reserved. // Licensed under the MIT license. See LICENSE file in the project root for full license information. // // ============================================================ // // StringLexer.inl // // // Implements the inlined methods of StringLexer class // // ============================================================ #ifndef __BINDER__STRING_LEXER_INL__ #define __BINDER__STRING_LEXER_INL__ StringLexer::StringLexer() { m_wcCurrentChar = INVALID_CHARACTER; m_fCurrentCharIsEscaped = FALSE; } StringLexer::~StringLexer() { // Nothing to do here } void StringLexer::Init(SString &inputString, BOOL fSupportEscaping) { m_cursor = inputString.Begin(); m_end = inputString.End(); m_fSupportEscaping = fSupportEscaping; m_fReadRawCharacter = FALSE; } BOOL StringLexer::IsWhitespace(WCHAR wcChar) { return ((wcChar == L'\n') || (wcChar == L'\r') || (wcChar == L' ') || (wcChar == L'\t')); } BOOL StringLexer::IsEOS(WCHAR wcChar) { return (wcChar == 0); } BOOL StringLexer::IsQuoteCharacter(WCHAR wcChar) { return ((wcChar == L'\'') || (wcChar == L'"')); } WCHAR StringLexer::PopCharacter(BOOL *pfIsEscaped) { WCHAR wcCurrentChar = m_wcCurrentChar; BINDER_LOG_ENTER(L"StringLexer::PopCharacter"); if (wcCurrentChar != INVALID_CHARACTER) { BINDER_LOG(L"HAVE wcCurrentChar"); m_wcCurrentChar = INVALID_CHARACTER; *pfIsEscaped = m_fCurrentCharIsEscaped; } else { BINDER_LOG(L"GET wcCurrentChar"); wcCurrentChar = GetNextCharacter(pfIsEscaped); } #ifdef BINDER_DEBUG_LOG PathString info; info.Printf(L"wcCurrentChar=%p", (void *) wcCurrentChar); BINDER_LOG((WCHAR *) info.GetUnicode()); #endif BINDER_LOG_LEAVE(L"StringLexer::PopCharacter"); return wcCurrentChar; } void StringLexer::PushCharacter(WCHAR wcCurrentChar, BOOL fIsEscaped) { BINDER_LOG_ENTER(L"StringLexer::PushCharacter"); #ifdef BINDER_DEBUG_LOG PathString info; info.Printf(L"wcCurrentChar=%p, fIsEscaped=%d", (void *) wcCurrentChar, fIsEscaped); BINDER_LOG((WCHAR *) info.GetUnicode()); #endif _ASSERTE(m_wcCurrentChar == INVALID_CHARACTER); m_wcCurrentChar = wcCurrentChar; m_fCurrentCharIsEscaped = fIsEscaped; BINDER_LOG_LEAVE(L"StringLexer::PushCharacter"); } WCHAR StringLexer::GetRawCharacter() { WCHAR wcCurrentChar = 0; if (m_cursor <= m_end) { wcCurrentChar = m_cursor[0]; m_fReadRawCharacter = TRUE; m_cursor++; } else { m_fReadRawCharacter = FALSE; } return wcCurrentChar; } void StringLexer::PushRawCharacter() { if (m_fReadRawCharacter) { m_cursor--; m_fReadRawCharacter = FALSE; } } WCHAR StringLexer::DecodeUTF16Character() { // See http://www.ietf.org/rfc/rfc2781.txt for details on UTF-16 encoding. WCHAR wcCurrentChar = 0; SIZE_T nCharacters = m_end - m_cursor + 1; WCHAR wcChar1 = GetRawCharacter(); if (wcChar1 < 0xd800) { wcCurrentChar = wcChar1; } else { // StringLexer is not designed to handle UTF-16 characters beyond the Basic Multilingual Plane, // since it stores all characters in 16-bit WCHARs. // However, since the vast majority of the time, we (Microsoft) produce the manifests, // this is likely a non-scenario, as the other Unicode planes would never be used in practice. if (wcChar1 <= 0xdbff) // 0xd800 - 0xdbff indicates the first WCHAR of a surrogate pair { if (nCharacters >= 2) { GetRawCharacter(); // Skip the second WCHAR of the surrogate pair } } // Otherwise, the character is either in the 0xdc00 - 0xdfff range, indicating the second WCHAR of a surrogate pair, // or in the 0xE000 - 0xFFFF range, which has within it ranges of invalid characters, and which we conservatively treat // as invalid. wcCurrentChar = INVALID_CHARACTER; } return wcCurrentChar; } WCHAR StringLexer::GetNextCharacter(BOOL *pfIsEscaped) { *pfIsEscaped = FALSE; WCHAR wcCurrentChar = GetRawCharacter(); // DecodeUTF16Character() if (wcCurrentChar == L'\\') { WCHAR wcTempChar = GetRawCharacter(); // DecodeUTF16Character() if (m_fSupportEscaping) { // Handle standard escapes switch (wcTempChar) { case L'"': case L'\'': case L',': case L'\\': case L'/': case L'=': break; case L't': wcTempChar = 9; break; case L'n': wcTempChar = 10; break; case L'r': wcTempChar = 13; break; case L'u': wcTempChar = ParseUnicode(); break; default: return INVALID_CHARACTER; } *pfIsEscaped = TRUE; wcCurrentChar = wcTempChar; } else { // Do not handle escapes except for quotes switch (wcTempChar) { case L'"': case L'\'': *pfIsEscaped = TRUE; wcCurrentChar = wcTempChar; break; default: PushRawCharacter(); break; } } } return wcCurrentChar; } WCHAR StringLexer::ParseUnicode() { int nCharacters = 0; WCHAR wcUnicodeChar = 0; for(;;) { WCHAR wcCurrentChar = DecodeUTF16Character(); nCharacters++; if (wcCurrentChar == L';') { break; } else if ((wcCurrentChar == INVALID_CHARACTER) || (nCharacters >= 9)) { return INVALID_CHARACTER; } wcUnicodeChar <<= 4; if ((wcCurrentChar >= L'0') && (wcCurrentChar <= L'9')) { wcUnicodeChar += (wcCurrentChar - L'0'); } else if ((wcCurrentChar >= L'a') && (wcCurrentChar <= L'f')) { wcUnicodeChar += (wcCurrentChar - L'a') + 10; } else if ((wcCurrentChar >= L'A') && (wcCurrentChar <= L'F')) { wcUnicodeChar += (wcCurrentChar - L'A') + 10; } else { return INVALID_CHARACTER; } } return wcUnicodeChar; } #endif