diff options
Diffstat (limited to 'doc/index.py')
-rwxr-xr-x | doc/index.py | 1248 |
1 files changed, 0 insertions, 1248 deletions
diff --git a/doc/index.py b/doc/index.py deleted file mode 100755 index 3cb3fece..00000000 --- a/doc/index.py +++ /dev/null @@ -1,1248 +0,0 @@ -#!/usr/bin/python -u -# -# imports the API description and fills up a database with -# name relevance to modules, functions or web pages -# -# Operation needed: -# ================= -# -# install mysqld, the python wrappers for mysql and libxml2, start mysqld -# Change the root passwd of mysql: -# mysqladmin -u root password new_password -# Create the new database xmlsoft -# mysqladmin -p create xmlsoft -# Create a database user 'veillard' and give him passord access -# change veillard and abcde with the right user name and passwd -# mysql -p -# password: -# mysql> GRANT ALL PRIVILEGES ON xmlsoft TO veillard@localhost -# IDENTIFIED BY 'abcde' WITH GRANT OPTION; -# -# As the user check the access: -# mysql -p xmlsoft -# Enter password: -# Welcome to the MySQL monitor.... -# mysql> use xmlsoft -# Database changed -# mysql> quit -# Bye -# -# Then run the script in the doc subdir, it will create the XSLTsymbols and -# word tables and populate them with informations extracted from -# the libxml2-api.xml API description, and make then accessible read-only -# by nobody@loaclhost the user expected to be Apache's one -# -# On the Apache configuration, make sure you have php support enabled -# - -import MySQLdb -import libxml2 -import sys -import string -import os - -# -# We are not interested in parsing errors here -# -def callback(ctx, str): - return -libxml2.registerErrorHandler(callback, None) - -# -# The dictionary of tables required and the SQL command needed -# to create them -# -TABLES={ - "XSLTsymbols" : """CREATE TABLE XSLTsymbols ( - name varchar(255) BINARY NOT NULL, - module varchar(255) BINARY NOT NULL, - type varchar(25) NOT NULL, - descr varchar(255), - UNIQUE KEY name (name), - KEY module (module))""", - "XSLTwords" : """CREATE TABLE XSLTwords ( - name varchar(50) BINARY NOT NULL, - symbol varchar(255) BINARY NOT NULL, - relevance int, - KEY name (name), - KEY symbol (symbol), - UNIQUE KEY ID (name, symbol))""", - "XSLTwordsHTML" : """CREATE TABLE XSLTwordsHTML ( - name varchar(50) BINARY NOT NULL, - resource varchar(255) BINARY NOT NULL, - section varchar(255), - id varchar(50), - relevance int, - KEY name (name), - KEY resource (resource), - UNIQUE KEY ref (name, resource))""", - "XSLTwordsArchive" : """CREATE TABLE XSLTwordsArchive ( - name varchar(50) BINARY NOT NULL, - ID int(11) NOT NULL, - relevance int, - KEY name (name), - UNIQUE KEY ref (name, ID))""", - "XSLTpages" : """CREATE TABLE XSLTpages ( - resource varchar(255) BINARY NOT NULL, - title varchar(255) BINARY NOT NULL, - UNIQUE KEY name (resource))""", - "archives" : """CREATE TABLE archives ( - ID int(11) NOT NULL auto_increment, - resource varchar(255) BINARY NOT NULL, - title varchar(255) BINARY NOT NULL, - UNIQUE KEY id (ID,resource(255)), - INDEX (ID), - INDEX (resource))""", - "Queries" : """CREATE TABLE Queries ( - ID int(11) NOT NULL auto_increment, - Value varchar(50) NOT NULL, - Count int(11) NOT NULL, - UNIQUE KEY id (ID,Value(35)), - INDEX (ID))""", -} - -# -# The XML API description file to parse -# -API="libxslt-api.xml" -DB=None - -######################################################################### -# # -# MySQL database interfaces # -# # -######################################################################### -def createTable(db, name): - global TABLES - - if db == None: - return -1 - if name == None: - return -1 - c = db.cursor() - - ret = c.execute("DROP TABLE IF EXISTS %s" % (name)) - if ret == 1: - print "Removed table %s" % (name) - print "Creating table %s" % (name) - try: - ret = c.execute(TABLES[name]) - except: - print "Failed to create table %s" % (name) - return -1 - return ret - -def checkTables(db): - global TABLES - - if db == None: - return -1 - c = db.cursor() - nbtables = c.execute("show tables") - print "Found %d tables" % (nbtables) - tables = {} - i = 0 - while i < nbtables: - l = c.fetchone() - name = l[0] - tables[name] = {} - i = i + 1 - - for table in TABLES.keys(): - if not tables.has_key(table): - print "table %s missing" % (table) - createTable(db, table) - try: - ret = c.execute("SELECT count(*) from %s" % table); - row = c.fetchone() - print "Table %s contains %d records" % (table, row[0]) - except: - print "Troubles with table %s : repairing" % (table) - ret = c.execute("repair table %s" % table); - print "repairing returned %d" % (ret) - ret = c.execute("SELECT count(*) from %s" % table); - row = c.fetchone() - print "Table %s contains %d records" % (table, row[0]) - print "checkTables finished" - - # make sure apache can access the tables read-only - try: - ret = c.execute("GRANT SELECT ON xmlsoft.* TO nobody@localhost") - ret = c.execute("GRANT INSERT,SELECT,UPDATE ON xmlsoft.Queries TO nobody@localhost") - except: - pass - return 0 - -def openMySQL(db="xmlsoft", passwd=None): - global DB - - if passwd == None: - try: - passwd = os.environ["MySQL_PASS"] - except: - print "No password available, set environment MySQL_PASS" - sys.exit(1) - - DB = MySQLdb.connect(passwd=passwd, db=db) - if DB == None: - return -1 - ret = checkTables(DB) - return ret - -def updateWord(name, symbol, relevance): - global DB - - if DB == None: - openMySQL() - if DB == None: - return -1 - if name == None: - return -1 - if symbol == None: - return -1 - - c = DB.cursor() - try: - ret = c.execute( -"""INSERT INTO XSLTwords (name, symbol, relevance) VALUES ('%s','%s', %d)""" % - (name, symbol, relevance)) - except: - try: - ret = c.execute( - """UPDATE XSLTwords SET relevance = %d where name = '%s' and symbol = '%s'""" % - (relevance, name, symbol)) - except: - print "Update word (%s, %s, %s) failed command" % (name, symbol, relevance) - print "UPDATE XSLTwords SET relevance = %d where name = '%s' and symbol = '%s'" % (relevance, name, symbol) - print sys.exc_type, sys.exc_value - return -1 - - return ret - -def updateSymbol(name, module, type, desc): - global DB - - updateWord(name, name, 50) - if DB == None: - openMySQL() - if DB == None: - return -1 - if name == None: - return -1 - if module == None: - return -1 - if type == None: - return -1 - - try: - desc = string.replace(desc, "'", " ") - l = string.split(desc, ".") - desc = l[0] - desc = desc[0:99] - except: - desc = "" - - c = DB.cursor() - try: - ret = c.execute( -"""INSERT INTO XSLTsymbols (name, module, type, descr) VALUES ('%s','%s', '%s', '%s')""" % - (name, module, type, desc)) - except: - try: - ret = c.execute( -"""UPDATE XSLTsymbols SET module='%s', type='%s', descr='%s' where name='%s'""" % - (module, type, desc, name)) - except: - print "Update symbol (%s, %s, %s) failed command" % (name, module, type) - print """UPDATE XSLTsymbols SET module='%s', type='%s', descr='%s' where name='%s'""" % (module, type, desc, name) - print sys.exc_type, sys.exc_value - return -1 - - return ret - -def addFunction(name, module, desc = ""): - return updateSymbol(name, module, 'function', desc) - -def addMacro(name, module, desc = ""): - return updateSymbol(name, module, 'macro', desc) - -def addEnum(name, module, desc = ""): - return updateSymbol(name, module, 'enum', desc) - -def addStruct(name, module, desc = ""): - return updateSymbol(name, module, 'struct', desc) - -def addConst(name, module, desc = ""): - return updateSymbol(name, module, 'const', desc) - -def addType(name, module, desc = ""): - return updateSymbol(name, module, 'type', desc) - -def addFunctype(name, module, desc = ""): - return updateSymbol(name, module, 'functype', desc) - -def addPage(resource, title): - global DB - - if DB == None: - openMySQL() - if DB == None: - return -1 - if resource == None: - return -1 - - c = DB.cursor() - try: - ret = c.execute( - """INSERT INTO XSLTpages (resource, title) VALUES ('%s','%s')""" % - (resource, title)) - except: - try: - ret = c.execute( - """UPDATE XSLTpages SET title='%s' WHERE resource='%s'""" % - (title, resource)) - except: - print "Update symbol (%s, %s, %s) failed command" % (name, module, type) - print """UPDATE XSLTpages SET title='%s' WHERE resource='%s'""" % (title, resource) - print sys.exc_type, sys.exc_value - return -1 - - return ret - -def updateWordHTML(name, resource, desc, id, relevance): - global DB - - if DB == None: - openMySQL() - if DB == None: - return -1 - if name == None: - return -1 - if resource == None: - return -1 - if id == None: - id = "" - if desc == None: - desc = "" - else: - try: - desc = string.replace(desc, "'", " ") - desc = desc[0:99] - except: - desc = "" - - c = DB.cursor() - try: - ret = c.execute( -"""INSERT INTO XSLTwordsHTML (name, resource, section, id, relevance) VALUES ('%s','%s', '%s', '%s', '%d')""" % - (name, resource, desc, id, relevance)) - except: - try: - ret = c.execute( -"""UPDATE XSLTwordsHTML SET section='%s', id='%s', relevance='%d' where name='%s' and resource='%s'""" % - (desc, id, relevance, name, resource)) - except: - print "Update symbol (%s, %s, %d) failed command" % (name, resource, relevance) - print """UPDATE XSLTwordsHTML SET section='%s', id='%s', relevance='%d' where name='%s' and resource='%s'""" % (desc, id, relevance, name, resource) - print sys.exc_type, sys.exc_value - return -1 - - return ret - -def checkXMLMsgArchive(url): - global DB - - if DB == None: - openMySQL() - if DB == None: - return -1 - if url == None: - return -1 - - c = DB.cursor() - try: - ret = c.execute( - """SELECT ID FROM archives WHERE resource='%s'""" % (url)) - row = c.fetchone() - if row == None: - return -1 - except: - return -1 - - return row[0] - -def addXMLMsgArchive(url, title): - global DB - - if DB == None: - openMySQL() - if DB == None: - return -1 - if url == None: - return -1 - if title == None: - title = "" - else: - title = string.replace(title, "'", " ") - title = title[0:99] - - c = DB.cursor() - try: - cmd = """INSERT INTO archives (resource, title) VALUES ('%s','%s')""" % (url, title) - ret = c.execute(cmd) - cmd = """SELECT ID FROM archives WHERE resource='%s'""" % (url) - ret = c.execute(cmd) - row = c.fetchone() - if row == None: - print "addXMLMsgArchive failed to get the ID: %s" % (url) - return -1 - except: - print "addXMLMsgArchive failed command: %s" % (cmd) - return -1 - - return((int)(row[0])) - -def updateWordArchive(name, id, relevance): - global DB - - if DB == None: - openMySQL() - if DB == None: - return -1 - if name == None: - return -1 - if id == None: - return -1 - - c = DB.cursor() - try: - ret = c.execute( -"""INSERT INTO XSLTwordsArchive (name, id, relevance) VALUES ('%s', '%d', '%d')""" % - (name, id, relevance)) - except: - try: - ret = c.execute( -"""UPDATE XSLTwordsArchive SET relevance='%d' where name='%s' and ID='%d'""" % - (relevance, name, id)) - except: - print "Update word archive (%s, %d, %d) failed command" % (name, id, relevance) - print """UPDATE XSLTwordsArchive SET relevance='%d' where name='%s' and ID='%d'""" % (relevance, name, id) - print sys.exc_type, sys.exc_value - return -1 - - return ret - -######################################################################### -# # -# Word dictionary and analysis routines # -# # -######################################################################### - -# -# top 100 english word without the one len < 3 + own set -# -dropWords = { - 'the':0, 'this':0, 'can':0, 'man':0, 'had':0, 'him':0, 'only':0, - 'and':0, 'not':0, 'been':0, 'other':0, 'even':0, 'are':0, 'was':0, - 'new':0, 'most':0, 'but':0, 'when':0, 'some':0, 'made':0, 'from':0, - 'who':0, 'could':0, 'after':0, 'that':0, 'will':0, 'time':0, 'also':0, - 'have':0, 'more':0, 'these':0, 'did':0, 'was':0, 'two':0, 'many':0, - 'they':0, 'may':0, 'before':0, 'for':0, 'which':0, 'out':0, 'then':0, - 'must':0, 'one':0, 'through':0, 'with':0, 'you':0, 'said':0, - 'first':0, 'back':0, 'were':0, 'what':0, 'any':0, 'years':0, 'his':0, - 'her':0, 'where':0, 'all':0, 'its':0, 'now':0, 'much':0, 'she':0, - 'about':0, 'such':0, 'your':0, 'there':0, 'into':0, 'like':0, 'may':0, - 'would':0, 'than':0, 'our':0, 'well':0, 'their':0, 'them':0, 'over':0, - 'down':0, - 'net':0, 'www':0, 'bad':0, 'Okay':0, 'bin':0, 'cur':0, -} - -wordsDict = {} -wordsDictHTML = {} -wordsDictArchive = {} - -def cleanupWordsString(str): - str = string.replace(str, ".", " ") - str = string.replace(str, "!", " ") - str = string.replace(str, "?", " ") - str = string.replace(str, ",", " ") - str = string.replace(str, "'", " ") - str = string.replace(str, '"', " ") - str = string.replace(str, ";", " ") - str = string.replace(str, "(", " ") - str = string.replace(str, ")", " ") - str = string.replace(str, "{", " ") - str = string.replace(str, "}", " ") - str = string.replace(str, "<", " ") - str = string.replace(str, ">", " ") - str = string.replace(str, "=", " ") - str = string.replace(str, "/", " ") - str = string.replace(str, "*", " ") - str = string.replace(str, ":", " ") - str = string.replace(str, "#", " ") - str = string.replace(str, "\\", " ") - str = string.replace(str, "\n", " ") - str = string.replace(str, "\r", " ") - str = string.replace(str, "\xc2", " ") - str = string.replace(str, "\xa0", " ") - return str - -def cleanupDescrString(str): - str = string.replace(str, "'", " ") - str = string.replace(str, "\n", " ") - str = string.replace(str, "\r", " ") - str = string.replace(str, "\xc2", " ") - str = string.replace(str, "\xa0", " ") - l = string.split(str) - str = string.join(str) - return str - -def splitIdentifier(str): - ret = [] - while str != "": - cur = string.lower(str[0]) - str = str[1:] - if ((cur < 'a') or (cur > 'z')): - continue - while (str != "") and (str[0] >= 'A') and (str[0] <= 'Z'): - cur = cur + string.lower(str[0]) - str = str[1:] - while (str != "") and (str[0] >= 'a') and (str[0] <= 'z'): - cur = cur + str[0] - str = str[1:] - while (str != "") and (str[0] >= '0') and (str[0] <= '9'): - str = str[1:] - ret.append(cur) - return ret - -def addWord(word, module, symbol, relevance): - global wordsDict - - if word == None or len(word) < 3: - return -1 - if module == None or symbol == None: - return -1 - if dropWords.has_key(word): - return 0 - if ord(word[0]) > 0x80: - return 0 - - if wordsDict.has_key(word): - d = wordsDict[word] - if d == None: - return 0 - if len(d) > 500: - wordsDict[word] = None - return 0 - try: - relevance = relevance + d[(module, symbol)] - except: - pass - else: - wordsDict[word] = {} - wordsDict[word][(module, symbol)] = relevance - return relevance - -def addString(str, module, symbol, relevance): - if str == None or len(str) < 3: - return -1 - ret = 0 - str = cleanupWordsString(str) - l = string.split(str) - for word in l: - if len(word) > 2: - ret = ret + addWord(word, module, symbol, 5) - - return ret - -def addWordHTML(word, resource, id, section, relevance): - global wordsDictHTML - - if word == None or len(word) < 3: - return -1 - if resource == None or section == None: - return -1 - if dropWords.has_key(word): - return 0 - if ord(word[0]) > 0x80: - return 0 - - section = cleanupDescrString(section) - - if wordsDictHTML.has_key(word): - d = wordsDictHTML[word] - if d == None: - print "skipped %s" % (word) - return 0 - try: - (r,i,s) = d[resource] - if i != None: - id = i - if s != None: - section = s - relevance = relevance + r - except: - pass - else: - wordsDictHTML[word] = {} - d = wordsDictHTML[word]; - d[resource] = (relevance, id, section) - return relevance - -def addStringHTML(str, resource, id, section, relevance): - if str == None or len(str) < 3: - return -1 - ret = 0 - str = cleanupWordsString(str) - l = string.split(str) - for word in l: - if len(word) > 2: - try: - r = addWordHTML(word, resource, id, section, relevance) - if r < 0: - print "addWordHTML failed: %s %s" % (word, resource) - ret = ret + r - except: - print "addWordHTML failed: %s %s %d" % (word, resource, relevance) - print sys.exc_type, sys.exc_value - - return ret - -def addWordArchive(word, id, relevance): - global wordsDictArchive - - if word == None or len(word) < 3: - return -1 - if id == None or id == -1: - return -1 - if dropWords.has_key(word): - return 0 - if ord(word[0]) > 0x80: - return 0 - - if wordsDictArchive.has_key(word): - d = wordsDictArchive[word] - if d == None: - print "skipped %s" % (word) - return 0 - try: - r = d[id] - relevance = relevance + r - except: - pass - else: - wordsDictArchive[word] = {} - d = wordsDictArchive[word]; - d[id] = relevance - return relevance - -def addStringArchive(str, id, relevance): - if str == None or len(str) < 3: - return -1 - ret = 0 - str = cleanupWordsString(str) - l = string.split(str) - for word in l: - i = len(word) - if i > 2: - try: - r = addWordArchive(word, id, relevance) - if r < 0: - print "addWordArchive failed: %s %s" % (word, id) - else: - ret = ret + r - except: - print "addWordArchive failed: %s %s %d" % (word, id, relevance) - print sys.exc_type, sys.exc_value - return ret - -######################################################################### -# # -# XML API description analysis # -# # -######################################################################### - -def loadAPI(filename): - doc = libxml2.parseFile(filename) - print "loaded %s" % (filename) - return doc - -def foundExport(file, symbol): - if file == None: - return 0 - if symbol == None: - return 0 - addFunction(symbol, file) - l = splitIdentifier(symbol) - for word in l: - addWord(word, file, symbol, 10) - return 1 - -def analyzeAPIFile(top): - count = 0 - name = top.prop("name") - cur = top.children - while cur != None: - if cur.type == 'text': - cur = cur.next - continue - if cur.name == "exports": - count = count + foundExport(name, cur.prop("symbol")) - else: - print "unexpected element %s in API doc <file name='%s'>" % (name) - cur = cur.next - return count - -def analyzeAPIFiles(top): - count = 0 - cur = top.children - - while cur != None: - if cur.type == 'text': - cur = cur.next - continue - if cur.name == "file": - count = count + analyzeAPIFile(cur) - else: - print "unexpected element %s in API doc <files>" % (cur.name) - cur = cur.next - return count - -def analyzeAPIEnum(top): - file = top.prop("file") - if file == None: - return 0 - symbol = top.prop("name") - if symbol == None: - return 0 - - addEnum(symbol, file) - l = splitIdentifier(symbol) - for word in l: - addWord(word, file, symbol, 10) - - return 1 - -def analyzeAPIConst(top): - file = top.prop("file") - if file == None: - return 0 - symbol = top.prop("name") - if symbol == None: - return 0 - - addConst(symbol, file) - l = splitIdentifier(symbol) - for word in l: - addWord(word, file, symbol, 10) - - return 1 - -def analyzeAPIType(top): - file = top.prop("file") - if file == None: - return 0 - symbol = top.prop("name") - if symbol == None: - return 0 - - addType(symbol, file) - l = splitIdentifier(symbol) - for word in l: - addWord(word, file, symbol, 10) - return 1 - -def analyzeAPIFunctype(top): - file = top.prop("file") - if file == None: - return 0 - symbol = top.prop("name") - if symbol == None: - return 0 - - addFunctype(symbol, file) - l = splitIdentifier(symbol) - for word in l: - addWord(word, file, symbol, 10) - return 1 - -def analyzeAPIStruct(top): - file = top.prop("file") - if file == None: - return 0 - symbol = top.prop("name") - if symbol == None: - return 0 - - addStruct(symbol, file) - l = splitIdentifier(symbol) - for word in l: - addWord(word, file, symbol, 10) - - info = top.prop("info") - if info != None: - info = string.replace(info, "'", " ") - info = string.strip(info) - l = string.split(info) - for word in l: - if len(word) > 2: - addWord(word, file, symbol, 5) - return 1 - -def analyzeAPIMacro(top): - file = top.prop("file") - if file == None: - return 0 - symbol = top.prop("name") - if symbol == None: - return 0 - symbol = string.replace(symbol, "'", " ") - symbol = string.strip(symbol) - - info = None - cur = top.children - while cur != None: - if cur.type == 'text': - cur = cur.next - continue - if cur.name == "info": - info = cur.content - break - cur = cur.next - - l = splitIdentifier(symbol) - for word in l: - addWord(word, file, symbol, 10) - - if info == None: - addMacro(symbol, file) - print "Macro %s description has no <info>" % (symbol) - return 0 - - info = string.replace(info, "'", " ") - info = string.strip(info) - addMacro(symbol, file, info) - l = string.split(info) - for word in l: - if len(word) > 2: - addWord(word, file, symbol, 5) - return 1 - -def analyzeAPIFunction(top): - file = top.prop("file") - if file == None: - return 0 - symbol = top.prop("name") - if symbol == None: - return 0 - - symbol = string.replace(symbol, "'", " ") - symbol = string.strip(symbol) - info = None - cur = top.children - while cur != None: - if cur.type == 'text': - cur = cur.next - continue - if cur.name == "info": - info = cur.content - elif cur.name == "return": - rinfo = cur.prop("info") - if rinfo != None: - rinfo = string.replace(rinfo, "'", " ") - rinfo = string.strip(rinfo) - addString(rinfo, file, symbol, 7) - elif cur.name == "arg": - ainfo = cur.prop("info") - if ainfo != None: - ainfo = string.replace(ainfo, "'", " ") - ainfo = string.strip(ainfo) - addString(ainfo, file, symbol, 5) - name = cur.prop("name") - if name != None: - name = string.replace(name, "'", " ") - name = string.strip(name) - addWord(name, file, symbol, 7) - cur = cur.next - if info == None: - print "Function %s description has no <info>" % (symbol) - addFunction(symbol, file, "") - else: - info = string.replace(info, "'", " ") - info = string.strip(info) - addFunction(symbol, file, info) - addString(info, file, symbol, 5) - - l = splitIdentifier(symbol) - for word in l: - addWord(word, file, symbol, 10) - - return 1 - -def analyzeAPISymbols(top): - count = 0 - cur = top.children - - while cur != None: - if cur.type == 'text': - cur = cur.next - continue - if cur.name == "macro": - count = count + analyzeAPIMacro(cur) - elif cur.name == "function": - count = count + analyzeAPIFunction(cur) - elif cur.name == "const": - count = count + analyzeAPIConst(cur) - elif cur.name == "typedef": - count = count + analyzeAPIType(cur) - elif cur.name == "struct": - count = count + analyzeAPIStruct(cur) - elif cur.name == "enum": - count = count + analyzeAPIEnum(cur) - elif cur.name == "functype": - count = count + analyzeAPIFunctype(cur) - else: - print "unexpected element %s in API doc <files>" % (cur.name) - cur = cur.next - return count - -def analyzeAPI(doc): - count = 0 - if doc == None: - return -1 - root = doc.getRootElement() - if root.name != "api": - print "Unexpected root name" - return -1 - cur = root.children - while cur != None: - if cur.type == 'text': - cur = cur.next - continue - if cur.name == "files": - pass -# count = count + analyzeAPIFiles(cur) - elif cur.name == "symbols": - count = count + analyzeAPISymbols(cur) - else: - print "unexpected element %s in API doc" % (cur.name) - cur = cur.next - return count - -######################################################################### -# # -# Web pages parsing and analysis # -# # -######################################################################### - -import glob - -def analyzeHTMLText(doc, resource, p, section, id): - words = 0 - try: - content = p.content - words = words + addStringHTML(content, resource, id, section, 5) - except: - return -1 - return words - -def analyzeHTMLPara(doc, resource, p, section, id): - words = 0 - try: - content = p.content - words = words + addStringHTML(content, resource, id, section, 5) - except: - return -1 - return words - -def analyzeHTMLPre(doc, resource, p, section, id): - words = 0 - try: - content = p.content - words = words + addStringHTML(content, resource, id, section, 5) - except: - return -1 - return words - -def analyzeHTML(doc, resource, p, section, id): - words = 0 - try: - content = p.content - words = words + addStringHTML(content, resource, id, section, 5) - except: - return -1 - return words - -def analyzeHTML(doc, resource): - para = 0; - ctxt = doc.xpathNewContext() - try: - res = ctxt.xpathEval("//head/title") - title = res[0].content - except: - title = "Page %s" % (resource) - addPage(resource, title) - try: - items = ctxt.xpathEval("//h1 | //h2 | //h3 | //text()") - section = title - id = "" - for item in items: - if item.name == 'h1' or item.name == 'h2' or item.name == 'h3': - section = item.content - if item.prop("id"): - id = item.prop("id") - elif item.prop("name"): - id = item.prop("name") - elif item.type == 'text': - analyzeHTMLText(doc, resource, item, section, id) - para = para + 1 - elif item.name == 'p': - analyzeHTMLPara(doc, resource, item, section, id) - para = para + 1 - elif item.name == 'pre': - analyzeHTMLPre(doc, resource, item, section, id) - para = para + 1 - else: - print "Page %s, unexpected %s element" % (resource, item.name) - except: - print "Page %s: problem analyzing" % (resource) - print sys.exc_type, sys.exc_value - - return para - -def analyzeHTMLPages(): - ret = 0 - HTMLfiles = glob.glob("*.html") + glob.glob("tutorial/*.html") - for html in HTMLfiles: - if html[0:3] == "API": - continue - if html == "xslt.html": - continue - try: - doc = libxml2.htmlParseFile(html, None) - res = analyzeHTML(doc, html) - print "Parsed %s : %d paragraphs" % (html, res) - ret = ret + 1 - except: - print "could not parse %s" % (html) - return ret - -######################################################################### -# # -# Mail archives parsing and analysis # -# # -######################################################################### - -import time - -def getXMLDateArchive(t = None): - if t == None: - t = time.time() - T = time.gmtime(t) - month = time.strftime("%B", T) - year = T[0] - url = "http://mail.gnome.org/archives/xslt/%d-%s/date.html" % (year, month) - return url - -def scanXMLMsgArchive(url, title, force = 0): - if url == None or title == None: - return 0 - - ID = checkXMLMsgArchive(url) - if force == 0 and ID != -1: - return 0 - - if ID == -1: - ID = addXMLMsgArchive(url, title) - if ID == -1: - return 0 - - try: - print "Loading %s" % (url) - doc = libxml2.htmlParseFile(url, None); - except: - doc = None - if doc == None: - print "Failed to parse %s" % (url) - return 0 - - addStringArchive(title, ID, 20) - ctxt = doc.xpathNewContext() - texts = ctxt.xpathEval("//pre//text()") - for text in texts: - addStringArchive(text.content, ID, 5) - - return 1 - -def scanXMLDateArchive(t = None, force = 0): - global wordsDictArchive - - wordsDictArchive = {} - - url = getXMLDateArchive(t) - print "loading %s" % (url) - try: - doc = libxml2.htmlParseFile(url, None); - except: - doc = None - if doc == None: - print "Failed to parse %s" % (url) - return -1 - ctxt = doc.xpathNewContext() - anchors = ctxt.xpathEval("//a[@href]") - links = 0 - newmsg = 0 - for anchor in anchors: - href = anchor.prop("href") - if href == None or href[0:3] != "msg": - continue - try: - links = links + 1 - - msg = libxml2.buildURI(href, url) - title = anchor.content - if title != None and title[0:4] == 'Re: ': - title = title[4:] - if title != None and title[0:6] == '[xml] ': - title = title[6:] - if title != None and title[0:7] == '[xslt] ': - title = title[7:] - newmsg = newmsg + scanXMLMsgArchive(msg, title, force) - - except: - pass - - return newmsg - - -######################################################################### -# # -# Main code: open the DB, the API XML and analyze it # -# # -######################################################################### -try: - openMySQL() -except: - print "Failed to open the database" - print sys.exc_type, sys.exc_value - sys.exit(1) - -def analyzeArchives(t = None, force = 0): - global wordsDictArchive - - ret = scanXMLDateArchive(t, force) - print "Indexed %d words in %d archive pages" % (len(wordsDictArchive), ret) - - i = 0 - skipped = 0 - for word in wordsDictArchive.keys(): - refs = wordsDictArchive[word] - if refs == None: - skipped = skipped + 1 - continue; - for id in refs.keys(): - relevance = refs[id] - updateWordArchive(word, id, relevance) - i = i + 1 - - print "Found %d associations in HTML pages" % (i) - -def analyzeHTMLTop(): - global wordsDictHTML - - ret = analyzeHTMLPages() - print "Indexed %d words in %d HTML pages" % (len(wordsDictHTML), ret) - - i = 0 - skipped = 0 - for word in wordsDictHTML.keys(): - refs = wordsDictHTML[word] - if refs == None: - skipped = skipped + 1 - continue; - for resource in refs.keys(): - (relevance, id, section) = refs[resource] - updateWordHTML(word, resource, section, id, relevance) - i = i + 1 - - print "Found %d associations in HTML pages" % (i) - -def analyzeAPITop(): - global wordsDict - global API - - try: - doc = loadAPI(API) - ret = analyzeAPI(doc) - print "Analyzed %d blocs" % (ret) - doc.freeDoc() - except: - print "Failed to parse and analyze %s" % (API) - print sys.exc_type, sys.exc_value - sys.exit(1) - - print "Indexed %d words" % (len(wordsDict)) - i = 0 - skipped = 0 - for word in wordsDict.keys(): - refs = wordsDict[word] - if refs == None: - skipped = skipped + 1 - continue; - for (module, symbol) in refs.keys(): - updateWord(word, symbol, refs[(module, symbol)]) - i = i + 1 - - print "Found %d associations, skipped %d words" % (i, skipped) - -def usage(): - print "Usage index.py [--force] [--archive] [--archive-year year] [--archive-month month] [--API] [--docs]" - sys.exit(1) - -def main(): - args = sys.argv[1:] - force = 0 - if args: - i = 0 - while i < len(args): - if args[i] == '--force': - force = 1 - elif args[i] == '--archive': - analyzeArchives(None, force) - elif args[i] == '--archive-year': - i = i + 1; - year = args[i] - months = ["January" , "February", "March", "April", "May", - "June", "July", "August", "September", "October", - "November", "December"]; - for month in months: - try: - str = "%s-%s" % (year, month) - T = time.strptime(str, "%Y-%B") - t = time.mktime(T) + 3600 * 24 * 10; - analyzeArchives(t, force) - except: - print "Failed to index month archive:" - print sys.exc_type, sys.exc_value - elif args[i] == '--archive-month': - i = i + 1; - month = args[i] - try: - T = time.strptime(month, "%Y-%B") - t = time.mktime(T) + 3600 * 24 * 10; - analyzeArchives(t, force) - except: - print "Failed to index month archive:" - print sys.exc_type, sys.exc_value - elif args[i] == '--API': - analyzeAPITop() - elif args[i] == '--docs': - analyzeHTMLTop() - else: - usage() - i = i + 1 - else: - usage() - -if __name__ == "__main__": - main() |