summaryrefslogtreecommitdiff
path: root/src/binder/inc/stringlexer.inl
diff options
context:
space:
mode:
Diffstat (limited to 'src/binder/inc/stringlexer.inl')
-rw-r--r--src/binder/inc/stringlexer.inl266
1 files changed, 266 insertions, 0 deletions
diff --git a/src/binder/inc/stringlexer.inl b/src/binder/inc/stringlexer.inl
new file mode 100644
index 0000000000..f8bc3816a4
--- /dev/null
+++ b/src/binder/inc/stringlexer.inl
@@ -0,0 +1,266 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+// ============================================================
+//
+// StringLexer.inl
+//
+
+
+//
+// Implements the inlined methods of StringLexer class
+//
+// ============================================================
+
+#ifndef __BINDER__STRING_LEXER_INL__
+#define __BINDER__STRING_LEXER_INL__
+
+StringLexer::StringLexer()
+{
+ m_wcCurrentChar = INVALID_CHARACTER;
+ m_fCurrentCharIsEscaped = FALSE;
+}
+
+StringLexer::~StringLexer()
+{
+ // Nothing to do here
+}
+
+void StringLexer::Init(SString &inputString, BOOL fSupportEscaping)
+{
+ m_cursor = inputString.Begin();
+ m_end = inputString.End();
+ m_fSupportEscaping = fSupportEscaping;
+ m_fReadRawCharacter = FALSE;
+}
+
+BOOL StringLexer::IsWhitespace(WCHAR wcChar)
+{
+ return ((wcChar == L'\n') || (wcChar == L'\r') || (wcChar == L' ') || (wcChar == L'\t'));
+}
+
+BOOL StringLexer::IsEOS(WCHAR wcChar)
+{
+ return (wcChar == 0);
+}
+
+BOOL StringLexer::IsQuoteCharacter(WCHAR wcChar)
+{
+ return ((wcChar == L'\'') || (wcChar == L'"'));
+}
+
+WCHAR StringLexer::PopCharacter(BOOL *pfIsEscaped)
+{
+ WCHAR wcCurrentChar = m_wcCurrentChar;
+ BINDER_LOG_ENTER(L"StringLexer::PopCharacter");
+
+ if (wcCurrentChar != INVALID_CHARACTER)
+ {
+ BINDER_LOG(L"HAVE wcCurrentChar");
+ m_wcCurrentChar = INVALID_CHARACTER;
+ *pfIsEscaped = m_fCurrentCharIsEscaped;
+ }
+ else
+ {
+ BINDER_LOG(L"GET wcCurrentChar");
+ wcCurrentChar = GetNextCharacter(pfIsEscaped);
+ }
+
+#ifdef BINDER_DEBUG_LOG
+ PathString info;
+
+ info.Printf(L"wcCurrentChar=%p", (void *) wcCurrentChar);
+ BINDER_LOG((WCHAR *) info.GetUnicode());
+#endif
+
+ BINDER_LOG_LEAVE(L"StringLexer::PopCharacter");
+ return wcCurrentChar;
+}
+
+void StringLexer::PushCharacter(WCHAR wcCurrentChar,
+ BOOL fIsEscaped)
+{
+ BINDER_LOG_ENTER(L"StringLexer::PushCharacter");
+
+#ifdef BINDER_DEBUG_LOG
+ PathString info;
+
+ info.Printf(L"wcCurrentChar=%p, fIsEscaped=%d", (void *) wcCurrentChar, fIsEscaped);
+ BINDER_LOG((WCHAR *) info.GetUnicode());
+#endif
+
+ _ASSERTE(m_wcCurrentChar == INVALID_CHARACTER);
+
+ m_wcCurrentChar = wcCurrentChar;
+ m_fCurrentCharIsEscaped = fIsEscaped;
+
+ BINDER_LOG_LEAVE(L"StringLexer::PushCharacter");
+}
+
+WCHAR StringLexer::GetRawCharacter()
+{
+ WCHAR wcCurrentChar = 0;
+
+ if (m_cursor <= m_end)
+ {
+ wcCurrentChar = m_cursor[0];
+ m_fReadRawCharacter = TRUE;
+ m_cursor++;
+ }
+ else
+ {
+ m_fReadRawCharacter = FALSE;
+ }
+
+ return wcCurrentChar;
+}
+
+void StringLexer::PushRawCharacter()
+{
+ if (m_fReadRawCharacter)
+ {
+ m_cursor--;
+ m_fReadRawCharacter = FALSE;
+ }
+}
+
+WCHAR StringLexer::DecodeUTF16Character()
+{
+ // See http://www.ietf.org/rfc/rfc2781.txt for details on UTF-16 encoding.
+
+ WCHAR wcCurrentChar = 0;
+ SIZE_T nCharacters = m_end - m_cursor + 1;
+ WCHAR wcChar1 = GetRawCharacter();
+
+ if (wcChar1 < 0xd800)
+ {
+ wcCurrentChar = wcChar1;
+ }
+ else
+ {
+ // StringLexer is not designed to handle UTF-16 characters beyond the Basic Multilingual Plane,
+ // since it stores all characters in 16-bit WCHARs.
+ // However, since the vast majority of the time, we (Microsoft) produce the manifests,
+ // this is likely a non-scenario, as the other Unicode planes would never be used in practice.
+
+ if (wcChar1 <= 0xdbff) // 0xd800 - 0xdbff indicates the first WCHAR of a surrogate pair
+ {
+ if (nCharacters >= 2)
+ {
+ GetRawCharacter(); // Skip the second WCHAR of the surrogate pair
+ }
+ }
+ // Otherwise, the character is either in the 0xdc00 - 0xdfff range, indicating the second WCHAR of a surrogate pair,
+ // or in the 0xE000 - 0xFFFF range, which has within it ranges of invalid characters, and which we conservatively treat
+ // as invalid.
+
+ wcCurrentChar = INVALID_CHARACTER;
+ }
+
+ return wcCurrentChar;
+}
+
+
+WCHAR StringLexer::GetNextCharacter(BOOL *pfIsEscaped)
+{
+ *pfIsEscaped = FALSE;
+
+ WCHAR wcCurrentChar = GetRawCharacter(); // DecodeUTF16Character()
+ if (wcCurrentChar == L'\\')
+ {
+ WCHAR wcTempChar = GetRawCharacter(); // DecodeUTF16Character()
+
+ if (m_fSupportEscaping)
+ {
+ // Handle standard escapes
+ switch (wcTempChar)
+ {
+ case L'"':
+ case L'\'':
+ case L',':
+ case L'\\':
+ case L'/':
+ case L'=':
+ break;
+ case L't':
+ wcTempChar = 9;
+ break;
+ case L'n':
+ wcTempChar = 10;
+ break;
+ case L'r':
+ wcTempChar = 13;
+ break;
+ case L'u':
+ wcTempChar = ParseUnicode();
+ break;
+ default:
+ return INVALID_CHARACTER;
+ }
+
+ *pfIsEscaped = TRUE;
+ wcCurrentChar = wcTempChar;
+ }
+ else
+ {
+ // Do not handle escapes except for quotes
+ switch (wcTempChar)
+ {
+ case L'"':
+ case L'\'':
+ *pfIsEscaped = TRUE;
+ wcCurrentChar = wcTempChar;
+ break;
+ default:
+ PushRawCharacter();
+ break;
+ }
+ }
+ }
+
+ return wcCurrentChar;
+}
+
+WCHAR StringLexer::ParseUnicode()
+{
+ int nCharacters = 0;
+ WCHAR wcUnicodeChar = 0;
+
+ for(;;)
+ {
+ WCHAR wcCurrentChar = DecodeUTF16Character();
+ nCharacters++;
+
+ if (wcCurrentChar == L';')
+ {
+ break;
+ }
+ else if ((wcCurrentChar == INVALID_CHARACTER) || (nCharacters >= 9))
+ {
+ return INVALID_CHARACTER;
+ }
+
+ wcUnicodeChar <<= 4;
+
+ if ((wcCurrentChar >= L'0') && (wcCurrentChar <= L'9'))
+ {
+ wcUnicodeChar += (wcCurrentChar - L'0');
+ }
+ else if ((wcCurrentChar >= L'a') && (wcCurrentChar <= L'f'))
+ {
+ wcUnicodeChar += (wcCurrentChar - L'a') + 10;
+ }
+ else if ((wcCurrentChar >= L'A') && (wcCurrentChar <= L'F'))
+ {
+ wcUnicodeChar += (wcCurrentChar - L'A') + 10;
+ }
+ else
+ {
+ return INVALID_CHARACTER;
+ }
+ }
+
+ return wcUnicodeChar;
+}
+
+#endif