1 files changed, 111 insertions, 0 deletions
diff --git a/autobahn/utf8validator.py b/autobahn/utf8validator.py
new file mode 100644
index 0000000..2b9b3ec
--- /dev/null
+++ b/autobahn/utf8validator.py
@@ -0,0 +1,111 @@
+###############################################################################
+##
+##  Copyright 2011 Tavendo GmbH
+##
+##  Note:
+##
+##  This code is a Python implementation of the algorithm
+##
+##            "Flexible and Economical UTF-8 Decoder"
+##
+##  by Bjoern Hoehrmann
+##
+##       bjoern@hoehrmann.de
+##       http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
+##
+##  Licensed under the Apache License, Version 2.0 (the "License");
+##  you may not use this file except in compliance with the License.
+##  You may obtain a copy of the License at
+##
+##      http://www.apache.org/licenses/LICENSE-2.0
+##
+##  Unless required by applicable law or agreed to in writing, software
+##  distributed under the License is distributed on an "AS IS" BASIS,
+##  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+##  See the License for the specific language governing permissions and
+##  limitations under the License.
+##
+###############################################################################
+
+
+class Utf8Validator:
+   """
+   Incremental UTF-8 validator with constant memory consumption (minimal state).
+
+   Implements the algorithm "Flexible and Economical UTF-8 Decoder" by
+   Bjoern Hoehrmann (http://bjoern.hoehrmann.de/utf-8/decoder/dfa/).
+   """
+
+   ## DFA transitions
+   UTF8VALIDATOR_DFA = [
+     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, # 00..1f
+     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, # 20..3f
+     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, # 40..5f
+     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, # 60..7f
+     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, # 80..9f
+     7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, # a0..bf
+     8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, # c0..df
+     0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, # e0..ef
+     0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, # f0..ff
+     0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, # s0..s0
+     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, # s1..s2
+     1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, # s3..s4
+     1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, # s5..s6
+     1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, # s7..s8
+   ]
+
+   UTF8_ACCEPT = 0
+   UTF8_REJECT = 1
+
+   def __init__(self):
+      self.reset()
+
+   def decode(self, b):
+      """
+      Eat one UTF-8 octet, and validate on the fly.
+
+      Returns UTF8_ACCEPT when enough octets have been consumed, in which case
+      self.codepoint contains the decoded Unicode code point.
+
+      Returns UTF8_REJECT when invalid UTF-8 was encountered.
+
+      Returns some other positive integer when more octets need to be eaten.
+      """
+      type = Utf8Validator.UTF8VALIDATOR_DFA[b]
+      if self.state != Utf8Validator.UTF8_ACCEPT:
+         self.codepoint = (b & 0x3f) | (self.codepoint << 6)
+      else:
+         self.codepoint = (0xff >> type) & b
+      self.state = Utf8Validator.UTF8VALIDATOR_DFA[256 + self.state * 16 + type]
+      return self.state
+
+   def reset(self):
+      """
+      Reset validator to start new incremental UTF-8 decode/validation.
+      """
+      self.state = Utf8Validator.UTF8_ACCEPT
+      self.codepoint = 0
+      self.i = 0
+
+   def validate(self, ba):
+      """
+      Incrementally validate a chunk of bytes provided as string.
+
+      Will return a quad (valid?, endsOnCodePoint?, currentIndex, totalIndex).
+
+      As soon as an octet is encountered which renders the octet sequence
+      invalid, a quad with valid? == False is returned. currentIndex returns
+      the index within the currently consumed chunk, and totalIndex the
+      index within the total consumed sequence that was the point of bail out.
+      When valid? == True, currentIndex will be len(ba) and totalIndex the
+      total amount of consumed bytes.
+      """
+      l = len(ba)
+      for i in xrange(l):
+         ## optimized version of decode(), since we are not interested in actual code points
+         self.state = Utf8Validator.UTF8VALIDATOR_DFA[256 + (self.state << 4) + Utf8Validator.UTF8VALIDATOR_DFA[ord(ba[i])]]
+         if self.state == Utf8Validator.UTF8_REJECT:
+            self.i += i
+            return False, False, i, self.i
+      self.i += l
+      return True, self.state == Utf8Validator.UTF8_ACCEPT, l, self.i