mirror of
https://gitlab.com/libvirt/libvirt.git
synced 2025-01-03 19:45:21 +00:00
f34fdd5ab6
This is particularly useful on operating systems that don't ship Python as part of the base system (eg. FreeBSD) while still working just as well as it did before on Linux. While at it, make it explicit that our scripts are only going to work with Python 2, and remove the usage of unbuffered I/O, which as far as I can tell has no effect on the output files. Signed-off-by: Andrea Bolognani <abologna@redhat.com>
1267 lines
36 KiB
Python
Executable File
1267 lines
36 KiB
Python
Executable File
#!/usr/bin/env python2
|
|
#
|
|
# imports the API description and fills up a database with
|
|
# name relevance to modules, functions or web pages
|
|
#
|
|
# Operation needed:
|
|
# =================
|
|
#
|
|
# install mysqld, the python wrappers for mysql and libxml2, start mysqld
|
|
# - mysql-server
|
|
# - mysql
|
|
# - php-mysql
|
|
# - MySQL-python
|
|
# Change the root passwd of mysql:
|
|
# mysqladmin -u root password new_password
|
|
# Create the new database libvir
|
|
# mysqladmin -p create libvir
|
|
# Create a database user 'veillard' and give him password access
|
|
# change veillard and abcde with the right user name and passwd
|
|
# mysql -p
|
|
# password:
|
|
# mysql> GRANT ALL PRIVILEGES ON libvir TO veillard@localhost
|
|
# IDENTIFIED BY 'abcde' WITH GRANT OPTION;
|
|
# mysql> GRANT ALL PRIVILEGES ON libvir.* TO veillard@localhost
|
|
# IDENTIFIED BY 'abcde' WITH GRANT OPTION;
|
|
#
|
|
# As the user check the access:
|
|
# mysql -p libvir
|
|
# Enter password:
|
|
# Welcome to the MySQL monitor....
|
|
# mysql> use libvir
|
|
# Database changed
|
|
# mysql> quit
|
|
# Bye
|
|
#
|
|
# Then run the script in the doc subdir, it will create the symbols and
|
|
# word tables and populate them with information extracted from
|
|
# the libvirt-api.xml API description, and make them accessible read-only
|
|
# by nobody@localhost, the user Apache is expected to run as
|
|
#
|
|
# On the Apache configuration, make sure you have php support enabled
|
|
#
|
|
|
|
import MySQLdb
|
|
import libxml2
|
|
import sys
|
|
import string
|
|
import os
|
|
|
|
#
|
|
# We are not interested in parsing errors here
|
|
#
|
|
def callback(ctx, str):
    """Ignore a libxml2 error report.

    Both arguments are accepted and discarded; nothing is printed.
    """
    return None
|
|
libxml2.registerErrorHandler(callback, None)
|
|
|
|
#
|
|
# The dictionary of tables required and the SQL command needed
|
|
# to create them
|
|
#
|
|
# Each key is the name of a table the indexer needs; the value is the
# SQL statement createTable() executes to build it.
TABLES = {
    "symbols": """CREATE TABLE symbols (
           name varchar(255) BINARY NOT NULL,
           module varchar(255) BINARY NOT NULL,
           type varchar(25) NOT NULL,
           descr varchar(255),
           UNIQUE KEY name (name),
           KEY module (module))""",
    "words": """CREATE TABLE words (
           name varchar(50) BINARY NOT NULL,
           symbol varchar(255) BINARY NOT NULL,
           relevance int,
           KEY name (name),
           KEY symbol (symbol),
           UNIQUE KEY ID (name, symbol))""",
    "wordsHTML": """CREATE TABLE wordsHTML (
           name varchar(50) BINARY NOT NULL,
           resource varchar(255) BINARY NOT NULL,
           section varchar(255),
           id varchar(50),
           relevance int,
           KEY name (name),
           KEY resource (resource),
           UNIQUE KEY ref (name, resource))""",
    "wordsArchive": """CREATE TABLE wordsArchive (
           name varchar(50) BINARY NOT NULL,
           ID int(11) NOT NULL,
           relevance int,
           KEY name (name),
           UNIQUE KEY ref (name, ID))""",
    "pages": """CREATE TABLE pages (
           resource varchar(255) BINARY NOT NULL,
           title varchar(255) BINARY NOT NULL,
           UNIQUE KEY name (resource))""",
    "archives": """CREATE TABLE archives (
           ID int(11) NOT NULL auto_increment,
           resource varchar(255) BINARY NOT NULL,
           title varchar(255) BINARY NOT NULL,
           UNIQUE KEY id (ID,resource(255)),
           INDEX (ID),
           INDEX (resource))""",
    "Queries": """CREATE TABLE Queries (
           ID int(11) NOT NULL auto_increment,
           Value varchar(50) NOT NULL,
           Count int(11) NOT NULL,
           UNIQUE KEY id (ID,Value(35)),
           INDEX (ID))""",
    "AllQueries": """CREATE TABLE AllQueries (
           ID int(11) NOT NULL auto_increment,
           Value varchar(50) NOT NULL,
           Count int(11) NOT NULL,
           UNIQUE KEY id (ID,Value(35)),
           INDEX (ID))""",
}
|
|
|
|
#
|
|
# The XML API description file to parse
|
|
#
|
|
API = "libvirt-api.xml"

# Module-wide database connection; None until openMySQL() assigns it.
DB = None
|
|
|
|
#########################################################################
|
|
# #
|
|
# MySQL database interfaces #
|
|
# #
|
|
#########################################################################
|
|
def createTable(db, name):
    """Drop and re-create one of the tables described in TABLES.

    db is an open database connection (it must provide cursor()) and
    name is a key of TABLES.

    Returns what the CREATE TABLE execute() call returned, or -1 when an
    argument is None or the creation fails.
    """
    if db is None or name is None:
        return -1
    c = db.cursor()

    # The table name is interpolated because SQL identifiers cannot be
    # passed as bound parameters.
    ret = c.execute("DROP TABLE IF EXISTS %s" % (name))
    if ret == 1:
        print("Removed table %s" % (name))
    print("Creating table %s" % (name))
    try:
        ret = c.execute(TABLES[name])
    except Exception:
        # Also reached when name is not a key of TABLES (KeyError).
        print("Failed to create table %s" % (name))
        print("%s %s" % sys.exc_info()[:2])
        return -1
    return ret
|
|
|
|
def checkTables(db, verbose = 1):
    """Make sure every table listed in TABLES exists and is readable.

    Missing tables are created with createTable().  Each table is then
    counted; if counting fails a "repair table" is issued and the count
    retried (a second failure propagates to the caller).  Finally read
    access is granted to nobody@localhost on a best-effort basis.

    Returns -1 when db is None, 0 otherwise.
    """
    if db is None:
        return -1
    c = db.cursor()
    nbtables = c.execute("show tables")
    if verbose:
        print("Found %d tables" % (nbtables))

    # Names of the tables the server already has.
    existing = set()
    for _ in range(nbtables):
        existing.add(c.fetchone()[0])

    for table in TABLES:
        if table not in existing:
            print("table %s missing" % (table))
            createTable(db, table)
        try:
            c.execute("SELECT count(*) from %s" % table)
            row = c.fetchone()
            if verbose:
                print("Table %s contains %d records" % (table, row[0]))
        except Exception:
            print("Troubles with table %s : repairing" % (table))
            ret = c.execute("repair table %s" % table)
            print("repairing returned %d" % (ret))
            c.execute("SELECT count(*) from %s" % table)
            row = c.fetchone()
            print("Table %s contains %d records" % (table, row[0]))
    if verbose:
        print("checkTables finished")

    # make sure apache can access the tables read-only
    try:
        c.execute("GRANT SELECT ON libvir.* TO nobody@localhost")
        c.execute("GRANT INSERT,SELECT,UPDATE ON libvir.Queries TO nobody@localhost")
    except Exception:
        # Deliberately best-effort: a failed GRANT does not abort indexing.
        pass
    return 0
|
|
|
|
def openMySQL(db="libvir", passwd=None, verbose = 1):
    """Connect to the MySQL database and store the connection in DB.

    When passwd is None the password is read from the MySQL_PASS
    environment variable; if that is unset the script exits with
    status 1.  After connecting, checkTables() is run on the new
    connection.

    Returns the result of checkTables(), or -1 if no connection object
    was obtained.
    """
    global DB

    if passwd is None:
        passwd = os.environ.get("MySQL_PASS")
        if passwd is None:
            print("No password available, set environment MySQL_PASS")
            sys.exit(1)

    DB = MySQLdb.connect(passwd=passwd, db=db)
    if DB is None:
        return -1
    return checkTables(DB, verbose)
|
|
|
|
def updateWord(name, symbol, relevance):
    """Store the relevance of word name for symbol in the words table.

    An INSERT is attempted first; if it fails (for instance because the
    UNIQUE KEY on (name, symbol) rejects a duplicate) the existing row
    is updated instead.  Values are passed as bound parameters so that
    quotes or backslashes in the indexed text cannot break the SQL.

    Returns the execute() result, or -1 on missing arguments, missing
    connection, or when both statements fail.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None or name is None or symbol is None:
        return -1

    c = DB.cursor()
    try:
        ret = c.execute(
            "INSERT INTO words (name, symbol, relevance) VALUES (%s, %s, %s)",
            (name, symbol, relevance))
    except Exception:
        update = "UPDATE words SET relevance = %s where name = %s and symbol = %s"
        params = (relevance, name, symbol)
        try:
            ret = c.execute(update, params)
        except Exception:
            print("Update word (%s, %s, %s) failed command" % (name, symbol, relevance))
            print("%s with parameters %r" % (update, params))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
|
|
|
|
def updateSymbol(name, module, type, desc):
    """Store symbol name, its module, kind and description in symbols.

    The symbol is first indexed under its own name with relevance 50.
    The description is reduced to its first sentence (text before the
    first "."), with single quotes turned into spaces, truncated to 99
    characters; an unusable description becomes "".  INSERT is tried
    first, UPDATE on failure, both with bound parameters.

    Returns the execute() result, or -1 on missing arguments, missing
    connection, or when both statements fail.
    """
    global DB

    updateWord(name, name, 50)
    if DB is None:
        openMySQL()
    if DB is None or name is None or module is None or type is None:
        return -1

    try:
        desc = desc.replace("'", " ").split(".")[0][0:99]
    except Exception:
        # desc was None or not a string
        desc = ""

    c = DB.cursor()
    try:
        ret = c.execute(
            "INSERT INTO symbols (name, module, type, descr) VALUES (%s, %s, %s, %s)",
            (name, module, type, desc))
    except Exception:
        update = "UPDATE symbols SET module=%s, type=%s, descr=%s where name=%s"
        params = (module, type, desc, name)
        try:
            ret = c.execute(update, params)
        except Exception:
            print("Update symbol (%s, %s, %s) failed command" % (name, module, type))
            print("%s with parameters %r" % (update, params))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
|
|
|
|
def addFunction(name, module, desc = ""):
    """Record name as a 'function' symbol of module; see updateSymbol()."""
    kind = 'function'
    return updateSymbol(name, module, kind, desc)
|
|
|
|
def addMacro(name, module, desc = ""):
    """Record name as a 'macro' symbol of module; see updateSymbol()."""
    kind = 'macro'
    return updateSymbol(name, module, kind, desc)
|
|
|
|
def addEnum(name, module, desc = ""):
    """Record name as an 'enum' symbol of module; see updateSymbol()."""
    kind = 'enum'
    return updateSymbol(name, module, kind, desc)
|
|
|
|
def addStruct(name, module, desc = ""):
    """Record name as a 'struct' symbol of module; see updateSymbol()."""
    kind = 'struct'
    return updateSymbol(name, module, kind, desc)
|
|
|
|
def addConst(name, module, desc = ""):
    """Record name as a 'const' symbol of module; see updateSymbol()."""
    kind = 'const'
    return updateSymbol(name, module, kind, desc)
|
|
|
|
def addType(name, module, desc = ""):
    """Record name as a 'type' symbol of module; see updateSymbol()."""
    kind = 'type'
    return updateSymbol(name, module, kind, desc)
|
|
|
|
def addFunctype(name, module, desc = ""):
    """Record name as a 'functype' symbol of module; see updateSymbol()."""
    kind = 'functype'
    return updateSymbol(name, module, kind, desc)
|
|
|
|
def addPage(resource, title):
    """Store the title of page resource in the pages table.

    INSERT is tried first and UPDATE on failure, both with bound
    parameters.  A None title is stored as "" because the column is
    declared NOT NULL.

    Returns the execute() result, or -1 on missing resource, missing
    connection, or when both statements fail.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None or resource is None:
        return -1
    if title is None:
        title = ""

    c = DB.cursor()
    try:
        ret = c.execute(
            "INSERT INTO pages (resource, title) VALUES (%s, %s)",
            (resource, title))
    except Exception:
        update = "UPDATE pages SET title=%s WHERE resource=%s"
        params = (title, resource)
        try:
            ret = c.execute(update, params)
        except Exception:
            # The previous message formatted names that do not exist in
            # this function and so raised NameError instead of reporting.
            print("Update page (%s, %s) failed command" % (resource, title))
            print("%s with parameters %r" % (update, params))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
|
|
|
|
def updateWordHTML(name, resource, desc, id, relevance):
    """Store the relevance of word name for HTML page resource.

    desc (the section title) has single quotes replaced by spaces and
    is truncated to 99 characters; None desc or id become "".  INSERT
    into wordsHTML is tried first, UPDATE on failure, both with bound
    parameters.

    Returns the execute() result, or -1 on missing name/resource,
    missing connection, or when both statements fail.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None or name is None or resource is None:
        return -1
    if id is None:
        id = ""
    if desc is None:
        desc = ""
    else:
        try:
            desc = desc.replace("'", " ")[0:99]
        except Exception:
            desc = ""

    c = DB.cursor()
    try:
        ret = c.execute(
            "INSERT INTO wordsHTML (name, resource, section, id, relevance) "
            "VALUES (%s, %s, %s, %s, %s)",
            (name, resource, desc, id, relevance))
    except Exception:
        update = ("UPDATE wordsHTML SET section=%s, id=%s, relevance=%s "
                  "where name=%s and resource=%s")
        params = (desc, id, relevance, name, resource)
        try:
            ret = c.execute(update, params)
        except Exception:
            print("Update symbol (%s, %s, %d) failed command" % (name, resource, relevance))
            print("%s with parameters %r" % (update, params))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
|
|
|
|
def checkXMLMsgArchive(url):
    """Look up the ID of archived message url in the archives table.

    The URL is passed as a bound parameter.

    Returns the ID column of the matching row, or -1 when url is None,
    no connection is available, the query fails or no row matches.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None or url is None:
        return -1

    c = DB.cursor()
    try:
        c.execute("SELECT ID FROM archives WHERE resource=%s", (url,))
        row = c.fetchone()
    except Exception:
        return -1
    if row is None:
        return -1

    return row[0]
|
|
|
|
def addXMLMsgArchive(url, title):
    """Insert archived message url into archives and return its ID.

    title has single quotes replaced by spaces and is truncated to 99
    characters (None becomes "").  After the INSERT the row is read
    back to obtain its ID.  Both statements use bound parameters.

    Returns the ID as an int, or -1 when url is None, no connection is
    available, a statement fails or the row cannot be read back.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None or url is None:
        return -1
    if title is None:
        title = ""
    else:
        title = title.replace("'", " ")[0:99]

    c = DB.cursor()
    # cmd always names the statement being run so the handler can report it
    cmd = "INSERT INTO archives (resource, title) VALUES (%s, %s)"
    try:
        c.execute(cmd, (url, title))
        cmd = "SELECT ID FROM archives WHERE resource=%s"
        c.execute(cmd, (url,))
        row = c.fetchone()
        if row is None:
            print("addXMLMsgArchive failed to get the ID: %s" % (url))
            return -1
    except Exception:
        print("addXMLMsgArchive failed command: %s" % (cmd))
        return -1

    return int(row[0])
|
|
|
|
def updateWordArchive(name, id, relevance):
    """Store the relevance of word name for archived message id.

    INSERT into wordsArchive is tried first, UPDATE on failure, both
    with bound parameters.

    Returns the execute() result, or -1 on missing name/id, missing
    connection, or when both statements fail.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None or name is None or id is None:
        return -1

    c = DB.cursor()
    try:
        ret = c.execute(
            "INSERT INTO wordsArchive (name, id, relevance) VALUES (%s, %s, %s)",
            (name, id, relevance))
    except Exception:
        update = "UPDATE wordsArchive SET relevance=%s where name=%s and ID=%s"
        params = (relevance, name, id)
        try:
            ret = c.execute(update, params)
        except Exception:
            print("Update word archive (%s, %s, %s) failed command" % (name, id, relevance))
            print("%s with parameters %r" % (update, params))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
|
|
|
|
#########################################################################
|
|
# #
|
|
# Word dictionary and analysis routines #
|
|
# #
|
|
#########################################################################
|
|
|
|
#
|
|
# top 100 English words, minus those shorter than 3 letters, plus our own set
|
|
#
|
|
# Words that are never indexed.  Kept as a dict mapping each word to 0
# so that existing membership tests on it keep working; the duplicated
# 'was' and 'may' entries of the former literal are listed once.
dropWords = dict.fromkeys("""
    the this can man had him only
    and not been other even are was
    new most but when some made from
    who could after that will time also
    have more these did two many
    they may before for which out then
    must one through with you said
    first back were what any years his
    her where all its now much she
    about such your there into like
    would than our well their them over
    down
    net www bad Okay bin cur
""".split(), 0)
|
|
|
|
# In-memory indexes filled while scanning and flushed to the database
# afterwards: word -> {(module, symbol): relevance}
wordsDict = dict()
# word -> {resource: (relevance, id, section)}
wordsDictHTML = dict()
# word -> {archive ID: relevance}
wordsDictArchive = dict()
|
|
|
|
def cleanupWordsString(str):
    """Return str with punctuation and line breaks replaced by spaces.

    Every occurrence of each separator below becomes a single space, so
    that a plain whitespace split afterwards yields the words.
    """
    separators = (".", "!", "?", ",", "'", '"', ";", "(", ")", "{", "}",
                  "<", ">", "=", "/", "*", ":", "#", "\\", "\n", "\r",
                  "\xc2", "\xa0")
    text = str
    for separator in separators:
        text = text.replace(separator, " ")
    return text
|
|
|
|
def cleanupDescrString(str):
    """Return str as a single line with normalized spacing.

    Single quotes, line breaks and the bytes \\xc2 / \\xa0 are replaced
    by spaces, then runs of whitespace are collapsed to one space and
    leading/trailing whitespace is dropped.

    The earlier version joined the characters of the string rather than
    the list of words, turning "Title" into "T i t l e".
    """
    text = str
    for junk in ("'", "\n", "\r", "\xc2", "\xa0"):
        text = text.replace(junk, " ")
    return " ".join(text.split())
|
|
|
|
def splitIdentifier(str):
    """Split an identifier into lower-cased words.

    A word starts at any ASCII letter and extends over the run of
    upper-case letters that follows, then the run of lower-case letters
    after that.  Digits directly following a word are dropped and any
    other character is skipped.  For example "virConnectOpen" gives
    ["vir", "connect", "open"].
    """
    words = []
    pos = 0
    end = len(str)
    while pos < end:
        word = str[pos].lower()
        pos += 1
        if not ('a' <= word <= 'z'):
            continue

        # run of capitals, e.g. the "IR" of "VIR_"
        start = pos
        while pos < end and 'A' <= str[pos] <= 'Z':
            pos += 1
        word += str[start:pos].lower()

        # run of lower-case letters
        start = pos
        while pos < end and 'a' <= str[pos] <= 'z':
            pos += 1
        word += str[start:pos]

        # trailing digits are not part of the word
        while pos < end and '0' <= str[pos] <= '9':
            pos += 1
        words.append(word)
    return words
|
|
|
|
def addWord(word, module, symbol, relevance):
    """Accumulate the relevance of word for (module, symbol) in wordsDict.

    Returns -1 for a missing argument or a word shorter than 3
    characters, 0 when the word is ignored (listed in dropWords, first
    character code above 0x80, or already marked as too common), and
    otherwise the accumulated relevance now stored.

    A word that already has more than 500 references is marked with
    None in wordsDict and ignored from then on.
    """
    if word is None or len(word) < 3:
        return -1
    if module is None or symbol is None:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    if word in wordsDict:
        refs = wordsDict[word]
        if refs is None:
            return 0
        if len(refs) > 500:
            wordsDict[word] = None
            return 0
        relevance = relevance + refs.get((module, symbol), 0)
    else:
        refs = wordsDict[word] = {}
    refs[(module, symbol)] = relevance
    return relevance
|
|
|
|
def addString(str, module, symbol, relevance):
    """Index every word of str for (module, symbol).

    The text is cleaned with cleanupWordsString(), split on whitespace,
    and each word longer than 2 characters is passed to addWord() with
    the caller's relevance (previously a fixed 5 was used and the
    relevance argument was ignored).

    Returns -1 when str is None or shorter than 3 characters, otherwise
    the sum of the addWord() results.
    """
    if str is None or len(str) < 3:
        return -1
    ret = 0
    for word in cleanupWordsString(str).split():
        if len(word) > 2:
            ret = ret + addWord(word, module, symbol, relevance)
    return ret
|
|
|
|
def addWordHTML(word, resource, id, section, relevance):
    """Accumulate the relevance of word for page resource in wordsDictHTML.

    Each entry is stored as (relevance, id, section).  When the word was
    already seen on this page, the id and section recorded first are
    kept (unless they were None) and the relevances are added.

    Returns -1 for a missing argument or a word shorter than 3
    characters, 0 when the word is ignored, otherwise the accumulated
    relevance now stored.
    """
    if word is None or len(word) < 3:
        return -1
    if resource is None or section is None:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    section = cleanupDescrString(section)

    if word in wordsDictHTML:
        refs = wordsDictHTML[word]
        if refs is None:
            print("skipped %s" % (word))
            return 0
        if resource in refs:
            (r, i, s) = refs[resource]
            if i is not None:
                id = i
            if s is not None:
                section = s
            relevance = relevance + r
    else:
        refs = wordsDictHTML[word] = {}
    refs[resource] = (relevance, id, section)
    return relevance
|
|
|
|
def addStringHTML(str, resource, id, section, relevance):
    """Index every word of str for HTML page resource.

    The text is cleaned with cleanupWordsString(), split on whitespace,
    and each word longer than 2 characters is passed to addWordHTML().
    Failures are reported and skipped; like addStringArchive(), a
    negative result is no longer added to the total.

    Returns -1 when str is None or shorter than 3 characters, otherwise
    the sum of the successful addWordHTML() results.
    """
    if str is None or len(str) < 3:
        return -1
    ret = 0
    for word in cleanupWordsString(str).split():
        if len(word) <= 2:
            continue
        try:
            r = addWordHTML(word, resource, id, section, relevance)
            if r < 0:
                print("addWordHTML failed: %s %s" % (word, resource))
            else:
                ret = ret + r
        except Exception:
            print("addWordHTML failed: %s %s %d" % (word, resource, relevance))
            print("%s %s" % sys.exc_info()[:2])

    return ret
|
|
|
|
def addWordArchive(word, id, relevance):
    """Accumulate the relevance of word for archive id in wordsDictArchive.

    Returns -1 for a missing word, a word shorter than 3 characters or
    an id that is None or -1; 0 when the word is ignored; otherwise the
    accumulated relevance now stored.
    """
    if word is None or len(word) < 3:
        return -1
    if id is None or id == -1:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    if word in wordsDictArchive:
        refs = wordsDictArchive[word]
        if refs is None:
            print("skipped %s" % (word))
            return 0
        relevance = relevance + refs.get(id, 0)
    else:
        refs = wordsDictArchive[word] = {}
    refs[id] = relevance
    return relevance
|
|
|
|
def addStringArchive(str, id, relevance):
    """Index every word of str for archived message id.

    The text is cleaned with cleanupWordsString(), split on whitespace,
    and each word longer than 2 characters is passed to
    addWordArchive().  Failures are reported and skipped.

    Returns -1 when str is None or shorter than 3 characters, otherwise
    the sum of the successful addWordArchive() results.
    """
    if str is None or len(str) < 3:
        return -1
    ret = 0
    for word in cleanupWordsString(str).split():
        if len(word) <= 2:
            continue
        try:
            r = addWordArchive(word, id, relevance)
            if r < 0:
                print("addWordArchive failed: %s %s" % (word, id))
            else:
                ret = ret + r
        except Exception:
            print("addWordArchive failed: %s %s %d" % (word, id, relevance))
            print("%s %s" % sys.exc_info()[:2])
    return ret
|
|
|
|
#########################################################################
|
|
# #
|
|
# XML API description analysis #
|
|
# #
|
|
#########################################################################
|
|
|
|
def loadAPI(filename):
    """Parse filename with libxml2, report it, and return the document."""
    parsed = libxml2.parseFile(filename)
    print("loaded %s" % (filename))
    return parsed
|
|
|
|
def foundExport(file, symbol):
    """Register symbol as a function exported by file.

    The symbol is stored with addFunction() and each word of its name
    is indexed with relevance 10.  Returns 1, or 0 if either argument
    is None.
    """
    if file is None or symbol is None:
        return 0
    addFunction(symbol, file)
    for part in splitIdentifier(symbol):
        addWord(part, file, symbol, 10)
    return 1
|
|
|
|
def analyzeAPIFile(top):
    """Index the <exports> children of one <file> element.

    Text nodes are skipped; any other child is reported as unexpected.
    Returns the number of exports registered by foundExport().
    """
    count = 0
    name = top.prop("name")
    cur = top.children
    while cur is not None:
        if cur.type != 'text':
            if cur.name == "exports":
                count = count + foundExport(name, cur.prop("symbol"))
            else:
                # The message has two placeholders: the offending
                # element and the enclosing file (only one value was
                # supplied before, which raised TypeError).
                print("unexpected element %s in API doc <file name='%s'>" % (cur.name, name))
        cur = cur.next
    return count
|
|
|
|
def analyzeAPIFiles(top):
    """Index every <file> child of the <files> element top.

    Text nodes are skipped and other elements reported.  Returns the
    total of the analyzeAPIFile() results.
    """
    total = 0
    node = top.children
    while node is not None:
        if node.type != 'text':
            if node.name == "file":
                total = total + analyzeAPIFile(node)
            else:
                print("unexpected element %s in API doc <files>" % (node.name))
        node = node.next
    return total
|
|
|
|
def analyzeAPIEnum(top):
    """Register the <enum> element top and index the words of its name.

    Returns 1, or 0 when the element lacks a "file" or "name" property.
    """
    file = top.prop("file")
    if file is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    addEnum(symbol, file)
    for part in splitIdentifier(symbol):
        addWord(part, file, symbol, 10)
    return 1
|
|
|
|
def analyzeAPIConst(top):
    """Register the <const> element top and index the words of its name.

    Returns 1, or 0 when the element lacks a "file" or "name" property.
    """
    file = top.prop("file")
    if file is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    addConst(symbol, file)
    for part in splitIdentifier(symbol):
        addWord(part, file, symbol, 10)
    return 1
|
|
|
|
def analyzeAPIType(top):
    """Register the <typedef> element top and index the words of its name.

    Returns 1, or 0 when the element lacks a "file" or "name" property.
    """
    file = top.prop("file")
    if file is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    addType(symbol, file)
    for part in splitIdentifier(symbol):
        addWord(part, file, symbol, 10)
    return 1
|
|
|
|
def analyzeAPIFunctype(top):
    """Register the <functype> element top and index the words of its name.

    Returns 1, or 0 when the element lacks a "file" or "name" property.
    """
    file = top.prop("file")
    if file is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    addFunctype(symbol, file)
    for part in splitIdentifier(symbol):
        addWord(part, file, symbol, 10)
    return 1
|
|
|
|
def analyzeAPIStruct(top):
    """Register the <struct> element top and index its name and info.

    Words of the identifier get relevance 10; words longer than 2
    characters from the "info" property (quotes replaced by spaces) get
    relevance 5.  Returns 1, or 0 when "file" or "name" is missing.
    """
    file = top.prop("file")
    if file is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    addStruct(symbol, file)
    for part in splitIdentifier(symbol):
        addWord(part, file, symbol, 10)

    info = top.prop("info")
    if info is not None:
        for word in info.replace("'", " ").strip().split():
            if len(word) > 2:
                addWord(word, file, symbol, 5)
    return 1
|
|
|
|
def analyzeAPIMacro(top):
    """Register the <macro> element top and index its name and info.

    The description is the content of the first <info> child.  Words of
    the identifier get relevance 10 and description words longer than 2
    characters get relevance 5.

    Returns 1 when a description was found.  Returns 0 when "file" or
    "name" is missing, and also when the macro has no <info> child (the
    macro is still registered in that case).
    """
    file = top.prop("file")
    if file is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0
    symbol = symbol.replace("'", " ").strip()

    # Locate the first non-text child named <info>.
    info = None
    node = top.children
    while node is not None:
        if node.type != 'text' and node.name == "info":
            info = node.content
            break
        node = node.next

    for part in splitIdentifier(symbol):
        addWord(part, file, symbol, 10)

    if info is None:
        addMacro(symbol, file)
        print("Macro %s description has no <info>" % (symbol))
        return 0

    info = info.replace("'", " ").strip()
    addMacro(symbol, file, info)
    for word in info.split():
        if len(word) > 2:
            addWord(word, file, symbol, 5)
    return 1
|
|
|
|
def analyzeAPIFunction(top):
    """Register the <function> element top and index its documentation.

    Child elements are handled as follows: <info> supplies the
    description; the "info" property of <return> is indexed with
    relevance 7; for each <arg> the "info" property is indexed with
    relevance 5 and the argument name with relevance 7.  The words of
    the function name itself get relevance 10.

    Returns 1, or 0 when "file" or "name" is missing.
    """
    file = top.prop("file")
    if file is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0
    symbol = symbol.replace("'", " ").strip()

    info = None
    node = top.children
    while node is not None:
        if node.type != 'text':
            if node.name == "info":
                info = node.content
            elif node.name == "return":
                rinfo = node.prop("info")
                if rinfo is not None:
                    rinfo = rinfo.replace("'", " ").strip()
                    addString(rinfo, file, symbol, 7)
            elif node.name == "arg":
                ainfo = node.prop("info")
                if ainfo is not None:
                    ainfo = ainfo.replace("'", " ").strip()
                    addString(ainfo, file, symbol, 5)
                name = node.prop("name")
                if name is not None:
                    name = name.replace("'", " ").strip()
                    addWord(name, file, symbol, 7)
        node = node.next

    if info is None:
        print("Function %s description has no <info>" % (symbol))
        addFunction(symbol, file, "")
    else:
        info = info.replace("'", " ").strip()
        addFunction(symbol, file, info)
        addString(info, file, symbol, 5)

    for part in splitIdentifier(symbol):
        addWord(part, file, symbol, 10)

    return 1
|
|
|
|
def analyzeAPISymbols(top):
    """Index every child of the <symbols> element top.

    Each macro, function, const, typedef, struct, enum and functype
    child is passed to the matching analyzeAPI* function.  Text nodes
    are skipped and any other element is reported (the report now names
    <symbols>; it used to say <files>).

    Returns the sum of the per-element results.
    """
    count = 0
    cur = top.children

    while cur is not None:
        if cur.type == 'text':
            cur = cur.next
            continue
        if cur.name == "macro":
            count = count + analyzeAPIMacro(cur)
        elif cur.name == "function":
            count = count + analyzeAPIFunction(cur)
        elif cur.name == "const":
            count = count + analyzeAPIConst(cur)
        elif cur.name == "typedef":
            count = count + analyzeAPIType(cur)
        elif cur.name == "struct":
            count = count + analyzeAPIStruct(cur)
        elif cur.name == "enum":
            count = count + analyzeAPIEnum(cur)
        elif cur.name == "functype":
            count = count + analyzeAPIFunctype(cur)
        else:
            print("unexpected element %s in API doc <symbols>" % (cur.name))
        cur = cur.next
    return count
|
|
|
|
def analyzeAPI(doc):
    """Index the API description document doc.

    The root element must be <api>.  Its <symbols> children are indexed
    with analyzeAPISymbols(); <files> children are accepted but not
    analyzed; other elements are reported.

    Returns the number of blocks analyzed, or -1 when doc is None or
    the root element is not <api>.
    """
    if doc is None:
        return -1
    root = doc.getRootElement()
    if root.name != "api":
        print("Unexpected root name")
        return -1

    total = 0
    node = root.children
    while node is not None:
        if node.type != 'text':
            if node.name == "files":
                # analyzeAPIFiles() is intentionally not called here
                pass
            elif node.name == "symbols":
                total = total + analyzeAPISymbols(node)
            else:
                print("unexpected element %s in API doc" % (node.name))
        node = node.next
    return total
|
|
|
|
#########################################################################
|
|
# #
|
|
# Web pages parsing and analysis #
|
|
# #
|
|
#########################################################################
|
|
|
|
import glob
|
|
|
|
def analyzeHTMLText(doc, resource, p, section, id):
    """Index the content of text node p of page resource.

    doc is accepted but unused.  The words are indexed with relevance 5
    under the given section and id.

    Returns the addStringHTML() result, or -1 when reading the content
    or indexing it raises.
    """
    try:
        return addStringHTML(p.content, resource, id, section, 5)
    except Exception:
        return -1
|
|
|
|
def analyzeHTMLPara(doc, resource, p, section, id):
    """Index the content of paragraph node p of page resource.

    doc is accepted but unused.  The words are indexed with relevance 5
    under the given section and id.

    Returns the addStringHTML() result, or -1 when reading the content
    or indexing it raises.
    """
    try:
        return addStringHTML(p.content, resource, id, section, 5)
    except Exception:
        return -1
|
|
|
|
def analyzeHTMLPre(doc, resource, p, section, id):
    """Index the content of preformatted node p of page resource.

    doc is accepted but unused.  The words are indexed with relevance 5
    under the given section and id.

    Returns the addStringHTML() result, or -1 when reading the content
    or indexing it raises.
    """
    try:
        return addStringHTML(p.content, resource, id, section, 5)
    except Exception:
        return -1
|
|
|
|
def analyzeHTML(doc, resource, p, section, id):
    """Index the content of node p of page resource.

    NOTE: a two-argument function with the same name is defined later
    in this file and replaces this one at import time, so this version
    is effectively unreachable by name.

    doc is accepted but unused.  Returns the addStringHTML() result, or
    -1 when reading the content or indexing it raises.
    """
    try:
        return addStringHTML(p.content, resource, id, section, 5)
    except Exception:
        return -1
|
|
|
|
def analyzeHTML(doc, resource):
    """Index the text of the parsed HTML page doc, stored as resource.

    The page title (or "Page <resource>" when none is found) is
    recorded with addPage().  The document is then walked over its
    h1/h2/h3 headings and text nodes: a heading sets the current
    section and, if it has an "id" or "name" property, the current
    anchor id; each text node is indexed under the current section.

    The XPath context is released before returning.

    Returns the number of nodes indexed.
    """
    para = 0
    ctxt = doc.xpathNewContext()
    try:
        try:
            title = ctxt.xpathEval("//head/title")[0].content
        except Exception:
            title = "Page %s" % (resource)
        addPage(resource, title)
        try:
            section = title
            id = ""
            for item in ctxt.xpathEval("//h1 | //h2 | //h3 | //text()"):
                if item.name in ('h1', 'h2', 'h3'):
                    section = item.content
                    if item.prop("id"):
                        id = item.prop("id")
                    elif item.prop("name"):
                        id = item.prop("name")
                elif item.type == 'text':
                    analyzeHTMLText(doc, resource, item, section, id)
                    para = para + 1
                # The two branches below cannot be reached with the
                # XPath expression above; kept in case it is widened.
                elif item.name == 'p':
                    analyzeHTMLPara(doc, resource, item, section, id)
                    para = para + 1
                elif item.name == 'pre':
                    analyzeHTMLPre(doc, resource, item, section, id)
                    para = para + 1
                else:
                    print("Page %s, unexpected %s element" % (resource, item.name))
        except Exception:
            print("Page %s: problem analyzing" % (resource))
            print("%s %s" % sys.exc_info()[:2])
    finally:
        ctxt.xpathFreeContext()

    return para
|
|
|
|
def analyzeHTMLPages():
    """Index the HTML pages found below the current directory.

    Pages are taken from the top directory and the tutorial, CIM, ocaml
    and ruby subdirectories.  Files whose path starts with "API" and
    "xml.html" are skipped.  Each file is parsed as XML first and as
    HTML if that raises; the parsed document is freed once indexed.

    Returns the number of pages successfully analyzed.
    """
    ret = 0
    HTMLfiles = []
    for pattern in ("*.html", "tutorial/*.html", "CIM/*.html",
                    "ocaml/*.html", "ruby/*.html"):
        HTMLfiles = HTMLfiles + glob.glob(pattern)

    for html in HTMLfiles:
        if html[0:3] == "API" or html == "xml.html":
            continue
        try:
            doc = libxml2.parseFile(html)
        except Exception:
            doc = libxml2.htmlParseFile(html, None)
        try:
            res = analyzeHTML(doc, html)
            print("Parsed %s : %d paragraphs" % (html, res))
            ret = ret + 1
        except Exception:
            print("could not parse %s" % (html))
        finally:
            if doc is not None:
                doc.freeDoc()
    return ret
|
|
|
|
#########################################################################
|
|
# #
|
|
# Mail archives parsing and analysis #
|
|
# #
|
|
#########################################################################
|
|
|
|
import time
|
|
|
|
def getXMLDateArchive(t = None):
    """Return the URL of the libvir-list date index for the month of t.

    t is a timestamp in seconds since the epoch, interpreted in UTC;
    the current time is used when it is None.
    """
    when = time.gmtime(time.time() if t is None else t)
    return "http://www.redhat.com/archives/libvir-list/%d-%s/date.html" % (
        when[0], time.strftime("%B", when))
|
|
|
|
def scanXMLMsgArchive(url, title, force = 0):
    """Download and index one archived mail message.

    A message already present in the archives table is skipped unless
    force is non-zero; a new one is added with addXMLMsgArchive().  The
    title is indexed with relevance 20 and every text node below a
    <pre> element with relevance 5.  The parsed document and its XPath
    context are freed before returning.

    Returns 1 when the message was indexed, 0 otherwise.
    """
    if url is None or title is None:
        return 0

    ID = checkXMLMsgArchive(url)
    if force == 0 and ID != -1:
        return 0

    if ID == -1:
        ID = addXMLMsgArchive(url, title)
        if ID == -1:
            return 0

    try:
        print("Loading %s" % (url))
        doc = libxml2.htmlParseFile(url, None)
    except Exception:
        doc = None
    if doc is None:
        print("Failed to parse %s" % (url))
        return 0

    try:
        addStringArchive(title, ID, 20)
        ctxt = doc.xpathNewContext()
        try:
            for text in ctxt.xpathEval("//pre//text()"):
                addStringArchive(text.content, ID, 5)
        finally:
            ctxt.xpathFreeContext()
    finally:
        doc.freeDoc()

    return 1
|
|
|
|
def scanXMLDateArchive(t = None, force = 0):
    """Index the messages linked from one monthly archive index page.

    wordsDictArchive is reset first.  The index page for the month of t
    (see getXMLDateArchive()) is fetched and every link whose href
    starts with "msg" is passed to scanXMLMsgArchive(), after removing
    a leading "Re: " and then a leading "[xml] " from its title.  A
    failure on one message is reported and does not stop the scan.  The
    parsed index page and its XPath context are freed before returning.

    Returns the number of messages newly indexed, or -1 when the index
    page cannot be loaded.
    """
    global wordsDictArchive

    wordsDictArchive = {}

    url = getXMLDateArchive(t)
    print("loading %s" % (url))
    try:
        doc = libxml2.htmlParseFile(url, None)
    except Exception:
        doc = None
    if doc is None:
        print("Failed to parse %s" % (url))
        return -1

    newmsg = 0
    try:
        ctxt = doc.xpathNewContext()
        try:
            for anchor in ctxt.xpathEval("//a[@href]"):
                href = anchor.prop("href")
                if href is None or href[0:3] != "msg":
                    continue
                try:
                    msg = libxml2.buildURI(href, url)
                    title = anchor.content
                    if title is not None and title[0:4] == 'Re: ':
                        title = title[4:]
                    if title is not None and title[0:6] == '[xml] ':
                        title = title[6:]
                    newmsg = newmsg + scanXMLMsgArchive(msg, title, force)
                except Exception:
                    # Keep going with the next message, but say why
                    # this one was dropped.
                    print("Failed to index message %s" % (href))
                    print("%s %s" % sys.exc_info()[:2])
        finally:
            ctxt.xpathFreeContext()
    finally:
        doc.freeDoc()

    return newmsg
|
|
|
|
|
|
#########################################################################
|
|
# #
|
|
# Main code: open the DB, the API XML and analyze it #
|
|
# #
|
|
#########################################################################
|
|
def analyzeArchives(t = None, force = 0):
    """Scan one month of mail archives and store the word index.

    scanXMLDateArchive() fills wordsDictArchive; every (word, archive
    ID, relevance) association found there is then written with
    updateWordArchive().  Words marked None are skipped.
    """
    ret = scanXMLDateArchive(t, force)
    # wordsDictArchive is read after the scan because the scan rebinds it
    print("Indexed %d words in %d archive pages" % (len(wordsDictArchive), ret))

    i = 0
    for word, refs in wordsDictArchive.items():
        if refs is None:
            continue
        for id, relevance in refs.items():
            updateWordArchive(word, id, relevance)
            i = i + 1

    print("Found %d associations in archive pages" % (i))
|
|
|
|
def analyzeHTMLTop():
    """Index the HTML pages and store the resulting word index.

    analyzeHTMLPages() fills wordsDictHTML; every association found
    there is then written with updateWordHTML().  Words marked None are
    skipped.
    """
    pages = analyzeHTMLPages()
    print("Indexed %d words in %d HTML pages" % (len(wordsDictHTML), pages))

    stored = 0
    for word in wordsDictHTML.keys():
        refs = wordsDictHTML[word]
        if refs is not None:
            for resource in refs.keys():
                (relevance, id, section) = refs[resource]
                updateWordHTML(word, resource, section, id, relevance)
                stored = stored + 1

    print("Found %d associations in HTML pages" % (stored))
|
|
|
|
def analyzeAPITop():
    """Index the API description file and store the resulting word index.

    The file named by API is loaded and analyzed; on any error the
    failure is reported and the script exits with status 1.  Every
    association collected in wordsDict is then written with
    updateWord().  Words marked None are counted as skipped.
    """
    try:
        doc = loadAPI(API)
        ret = analyzeAPI(doc)
        print("Analyzed %d blocs" % (ret))
        doc.freeDoc()
    except Exception:
        print("Failed to parse and analyze %s" % (API))
        print("%s %s" % sys.exc_info()[:2])
        sys.exit(1)

    print("Indexed %d words" % (len(wordsDict)))
    i = 0
    skipped = 0
    for word, refs in wordsDict.items():
        if refs is None:
            skipped = skipped + 1
            continue
        for (module, symbol), relevance in refs.items():
            updateWord(word, symbol, relevance)
            i = i + 1

    print("Found %d associations, skipped %d words" % (i, skipped))
|
|
|
|
def usage():
    """Print the command line synopsis and exit with status 1."""
    synopsis = "Usage index.py [--force] [--archive] [--archive-year year] [--archive-month month] [--API] [--docs]"
    print(synopsis)
    sys.exit(1)
|
|
|
|
def _indexArchiveMonth(spec, force):
    """Index the mail archive of the month named by spec ("YYYY-Month")."""
    try:
        T = time.strptime(spec, "%Y-%B")
        # Move 10 days past the parsed date; presumably so that the
        # local-time/UTC conversion cannot fall into the previous month.
        t = time.mktime(T) + 3600 * 24 * 10
        analyzeArchives(t, force)
    except Exception:
        print("Failed to index month archive:")
        print("%s %s" % sys.exc_info()[:2])


def main():
    """Open the database and run the actions named on the command line.

    Options are processed in order: --force affects the archive options
    that follow it; --archive indexes the current month; --archive-year
    YEAR indexes each month of YEAR; --archive-month YYYY-Month indexes
    one month; --API indexes the API description; --docs indexes the
    HTML pages.  An unknown option, a missing option value or an empty
    command line prints the usage and exits with status 1.
    """
    try:
        openMySQL()
    except Exception:
        print("Failed to open the database")
        print("%s %s" % sys.exc_info()[:2])
        sys.exit(1)

    args = sys.argv[1:]
    if not args:
        usage()

    months = ["January", "February", "March", "April", "May",
              "June", "July", "August", "September", "October",
              "November", "December"]
    force = 0
    i = 0
    while i < len(args):
        arg = args[i]
        if arg == '--force':
            force = 1
        elif arg == '--archive':
            analyzeArchives(None, force)
        elif arg == '--archive-year' or arg == '--archive-month':
            i = i + 1
            if i >= len(args):
                # the option needs a value
                usage()
            if arg == '--archive-year':
                for month in months:
                    _indexArchiveMonth("%s-%s" % (args[i], month), force)
            else:
                _indexArchiveMonth(args[i], force)
        elif arg == '--API':
            analyzeAPITop()
        elif arg == '--docs':
            analyzeHTMLTop()
        else:
            usage()
        i = i + 1
|
|
|
|
if __name__ == "__main__":
|
|
main()
|