#!/usr/bin/env python
#
# Creative Commons Attribution License
# http://creativecommons.org/licenses/by/2.5/
#
# Trevor Strohman, 2006
#

"""Query tree nodes and visitor-based query rewriting.

A query is a tree of QueryNode objects.  Calling ``node.transform(visitor)``
builds a new tree: ``visitor.before(node)`` decides whether the node's
children are transformed, and ``visitor.after(old, new)`` may replace the
freshly built copy with something else.

Run as a script, the module parses its first command-line argument into a
``#combine`` query, prints it, and prints the DependenceModel expansion.
"""

import sys


class QueryNode(object):
    """Base class for every node of a query tree."""

    def __str__(self):
        return ""

    def transform(self, f):
        """Placeholder; subclasses override this.  Returns None here."""
        pass

    def isBeliefNode(self):
        """Return 0; subclasses that count as belief nodes override this."""
        return 0


class TextQueryNode(QueryNode):
    """Leaf node holding a single query term."""

    def __init__(self, text):
        self.text = text

    def __str__(self):
        return self.text

    def isBeliefNode(self):
        return 0

    def transform(self, visitor):
        """Return the visitor's replacement for a copy of this term."""
        # A leaf has no children, so the result of before() is not needed,
        # but the call is kept so visitors see every node.
        visitor.before(self)
        return visitor.after(self, TextQueryNode(self.text))


class FieldRestrictionNode(QueryNode):
    """Restricts ``child`` to a field; printed as ``child.fieldName``."""

    def __init__(self, child, fieldName):
        self.child = child
        self.fieldName = fieldName

    def __str__(self):
        return "%s.%s" % (self.child, self.fieldName)

    def transform(self, visitor):
        """Copy this node, transforming the child if the visitor asks to."""
        if visitor.before(self):
            xchild = self.child.transform(visitor)
        else:
            xchild = self.child
        return visitor.after(self, FieldRestrictionNode(xchild, self.fieldName))

    def isBeliefNode(self):
        return 0


class OperatorQueryNode(QueryNode):
    """Operator applied to a list of child nodes; printed as ``#type(...)``."""

    def __init__(self, type, nodes):
        self.type = type
        # Copy into a list: callers may hand in any iterable (e.g. a map
        # object), and the children are iterated more than once.
        self.nodes = list(nodes)

    def __str__(self):
        return "#%s(%s)" % (self.type, " ".join([str(n) for n in self.nodes]))

    def isBeliefNode(self):
        """Return 0 for syn, numeric, od* and uw* operators, 1 otherwise."""
        # Slicing instead of indexing keeps an empty operator name from
        # raising IndexError.
        if (self.type == "syn"
                or self.type[:1].isdigit()
                or self.type.startswith("od")
                or self.type.startswith("uw")):
            return 0
        return 1

    def transform(self, visitor):
        """Copy this node, transforming children if the visitor asks to."""
        if visitor.before(self):
            transformedNodes = [n.transform(visitor) for n in self.nodes]
        else:
            transformedNodes = self.nodes
        return visitor.after(self, OperatorQueryNode(self.type, transformedNodes))


class WeightedQueryNode(QueryNode):
    """Operator over ``(weight, node)`` pairs.

    ``weightedNodes`` keeps the pairs; ``nodes`` holds just the child nodes
    so that code written for OperatorQueryNode can inspect the children.
    """

    def __init__(self, type, weightedNodes):
        self.type = type
        self.weightedNodes = list(weightedNodes)
        self.nodes = [pair[1] for pair in self.weightedNodes]

    def __str__(self):
        pairs = ["%f %s" % (pair[0], pair[1]) for pair in self.weightedNodes]
        return "#%s(%s)" % (self.type, " ".join(pairs))

    def isBeliefNode(self):
        """Return a true value unless the operator is ``wsyn``."""
        return self.type not in ["wsyn"]

    def transform(self, visitor):
        """Copy this node, transforming children if the visitor asks to."""
        if visitor.before(self):
            transformedNodes = [(pair[0], pair[1].transform(visitor))
                                for pair in self.weightedNodes]
        else:
            transformedNodes = self.weightedNodes
        return visitor.after(self, WeightedQueryNode(self.type, transformedNodes))


class NodeVisitor(object):
    """Default visitor: never descends and keeps every copied node."""

    def before(self, node):
        """Return a true value if the children of ``node`` should be visited."""
        return 0

    def after(self, oldNode, newNode):
        """Return the node that takes the place of ``oldNode`` in the result."""
        return newNode


class FullCopier(NodeVisitor):
    """Visitor that descends everywhere, yielding a copy of the tree."""

    def before(self, node):
        return 1


class WeightFoldingCopier(NodeVisitor):
    """Placeholder; currently behaves exactly like NodeVisitor."""
    pass


class FieldWeighting(NodeVisitor):
    """Replaces each non-belief node with a weighted set of field restrictions.

    ``fieldSet`` is a sequence of ``(weight, fieldName)`` pairs; it is walked
    once per rewritten node, so it must be re-iterable (e.g. a list).
    ``operator`` is the type of the WeightedQueryNode that is produced.
    """

    def __init__(self, fieldSet, operator="wsum"):
        self.operator = operator
        self.fieldSet = fieldSet

    def hasAllProximityChildren(self, node):
        """True if ``node`` is a belief node with no belief-node children."""
        if node.isBeliefNode():
            return not any(child.isBeliefNode() for child in node.nodes)
        return False

    def before(self, node):
        # only transform children if this is a belief node
        return node.isBeliefNode()

    def after(self, oldNode, newNode):
        if newNode.isBeliefNode():
            return newNode
        nodes = []
        for field in self.fieldSet:
            weight = field[0]
            name = field[1]
            nodes.append((weight, FieldRestrictionNode(newNode, name)))
        return WeightedQueryNode(self.operator, nodes)


class AcronymExpansion(NodeVisitor):
    """Finds terms that appear to be acronyms (that have periods in them)
    and expands them two ways.  First, it removes the periods to make
    a single term that's smashed together.  Second, it splits on the period
    and adds a phrase operator.

    For a term like I.B.M., the result is #syn(IBM #od1(I B M)).
    The case of the term is left as it was written.
    """

    def before(self, node):
        # Descend through belief nodes so terms below e.g. #combine are
        # reached; the children of other operators are left untouched.
        return node.isBeliefNode()

    def after(self, oldNode, newNode):
        if not (isinstance(newNode, TextQueryNode) and '.' in newNode.text):
            return newNode
        text = newNode.text
        # A trailing or doubled period would otherwise yield empty terms.
        pieces = [piece for piece in text.split('.') if piece]
        if not pieces:
            # The term consists of periods only; there is nothing to expand.
            return newNode
        firstNode = TextQueryNode(text.replace('.', ''))
        secondNode = OperatorQueryNode("od1", [TextQueryNode(p) for p in pieces])
        return OperatorQueryNode("syn", [firstNode, secondNode])


class DependenceModel(NodeVisitor):
    """Expands a belief node over plain terms into a three-part #weight query.

    The parts are the original terms, ordered (#1) windows over every
    contiguous run of two or more terms, and unordered (#uwN) windows over
    every subset of two or more terms, weighted by ``singleWeight``,
    ``orderedWeight`` and ``unorderedWeight`` respectively.
    """

    def __init__(self):
        self.singleWeight = 0.8
        self.orderedWeight = 0.1
        self.unorderedWeight = 0.1

    def before(self, node):
        # only transform children if this is a belief node
        return node.isBeliefNode()

    def allTuples(self, nodes):
        """Return every non-empty subset of ``nodes``, keeping their order.

        The result has 2**len(nodes) - 1 entries.
        """
        nodes = list(nodes)
        if not nodes:
            return []
        if len(nodes) == 1:
            return [nodes]
        current = nodes[0]
        # all subsets of the tail, with and without the head in front
        remaining = self.allTuples(nodes[1:])
        prepend = [[current] + r for r in remaining]
        return [[current]] + prepend + remaining

    def orderedTuples(self, nodes):
        """Return every contiguous run of ``nodes``, grouped by start index."""
        nodes = list(nodes)
        count = len(nodes)
        return [nodes[start:stop]
                for start in range(count)
                for stop in range(start + 1, count + 1)]

    def after(self, old, node):
        # Only rewrite a belief node whose children are all non-belief nodes.
        if not node.isBeliefNode():
            return node
        singles = list(node.nodes)
        if any(child.isBeliefNode() for child in singles):
            return node
        if len(singles) < 2:
            # No pairs exist, so the window groups would be empty operators.
            return node

        ordered = [t for t in self.orderedTuples(singles) if len(t) > 1]
        orderedFeatures = [OperatorQueryNode("1", t) for t in ordered]
        unordered = [t for t in self.allTuples(singles) if len(t) > 1]
        # The unordered window is four times the number of terms in it.
        unorderedFeatures = [OperatorQueryNode("uw%d" % (len(t) * 4), t)
                             for t in unordered]

        children = [
            (self.singleWeight, OperatorQueryNode("combine", singles)),
            (self.orderedWeight, OperatorQueryNode("combine", orderedFeatures)),
            (self.unorderedWeight, OperatorQueryNode("combine", unorderedFeatures)),
        ]
        return WeightedQueryNode("weight", children)


class MetatermsExpansion(NodeVisitor):
    """Placeholder; currently behaves exactly like NodeVisitor."""
    pass


def simple_parse(s):
    """Split ``s`` on whitespace and wrap the terms in a #combine node."""
    return OperatorQueryNode("combine", [TextQueryNode(term) for term in s.split()])


def main(argv=None):
    """Print the query in argv[1] and its DependenceModel expansion.

    Returns the process exit status: 0 on success, 1 if no query was given.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        sys.stderr.write("usage: %s 'query terms'\n" % argv[0])
        return 1
    query = simple_parse(argv[1])
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(query)
    print("---")
    print(query.transform(DependenceModel()))
    return 0


if __name__ == "__main__":
    sys.exit(main())