#
# fielded
#
# Creative Commons Attribution License
# http://creativecommons.org/licenses/by/2.5/
#
# Trevor Strohman
# First release: 10 February 2006
#

"""Allows simple functional/relational access to multicolumn text files.

In a usual use case, you'd use read() to read in a text file, then use
the other functions to filter, project, aggregate or join this data.
The final result is either a list of rows (each row being a list of
fields), or in the case of aggregate, a dictionary mapping one of the
columns to the rest of the row.

An example use case follows.  Here, we're reading in a standard TREC
qrels file, which has lines that look like this:

    301 0 WTX001-B01-001 1

The first entry of the line is the query number, the second number is
typically unused, the third element is a document identifier, and the
last number indicates the relevance judgment for the document.

    # read the data
    lines = fielded.read( "qrels" )
    # only keep relevant documents
    lines = filter( lambda x: (int(x[3]) > 0), lines )
    # remove excess information, leaving query number and document numbers
    lines = fielded.project( lines, [0, 2] )
    # make a dictionary that maps query numbers to the positive
    # relevance judged documents (the query number is now column 0)
    result = fielded.aggregate( lines, 0 )
"""


def read(fn):
    """Read a whitespace-delimited text file into a list of rows.

    Each line is stripped and split on whitespace; the resulting list of
    field strings becomes one row.  Lines beginning with a hash mark are
    considered comments and are skipped.  A blank line yields an empty
    row.

    fn may be either a filename or an open file handle (any iterable of
    lines that has a close() method).  The input is always closed before
    returning, even when a handle was passed in and even if reading
    raises.
    """
    if isinstance(fn, str):
        # open() exists on both Python 2 and 3; the old file() builtin
        # was removed in Python 3.
        fn = open(fn)
    try:
        return [line.strip().split()
                for line in fn
                if not line.startswith("#")]
    finally:
        fn.close()


def project(lines, indexes):
    """Keep only the columns listed in indexes, in the order given.

    Returns a new list holding one new list per input row.  An index may
    appear more than once, in which case that column is repeated.
    """
    return [[line[index] for index in indexes] for line in lines]


def permute(lines, indexes):
    """Alternate name for project."""
    return project(lines, indexes)


def aggregate(lines, keyIndex):
    """Group rows by the value found in column keyIndex.

    Returns a dictionary mapping each distinct key value to a list with
    one entry per row carrying that key, in input order.  Each entry is
    the row with the key column removed; when exactly one field remains
    the bare field is stored instead of a one-element row.
    """
    result = {}
    for line in lines:
        fields = line[:keyIndex] + line[keyIndex + 1:]
        if len(fields) == 1:
            fields = fields[0]
        result.setdefault(line[keyIndex], []).append(fields)
    return result


def _normalize_join_keys(joinKeys):
    """Return joinKeys as a fresh list of (leftIndex, rightIndex) pairs."""
    if isinstance(joinKeys, int):
        joinKeys = [joinKeys]
    # Build a new list rather than rewriting entries in place, so the
    # caller's argument is never modified.
    return [(key, key) if isinstance(key, int) else tuple(key)
            for key in joinKeys]


def _build_key(row, indexes):
    """Return the hashable join key of row: a tuple of the indexed fields."""
    return tuple(row[i] for i in indexes)


def _build_hash(rows, indexes):
    """Map each join key to the positions of the rows that carry it."""
    table = {}
    for position, row in enumerate(rows):
        table.setdefault(_build_key(row, indexes), []).append(position)
    return table


def _perform_join(lines, keys, table, otherLines, join_row):
    """Probe table with every row of lines, emitting one joined row per match.

    join_row(row, otherRow) fixes the column order of each output row.
    """
    result = []
    for line in lines:
        for position in table.get(_build_key(line, keys), ()):
            result.append(join_row(line, otherLines[position]))
    return result


def join(left, right, joinKeys):
    """Inner-join two lists of rows on one or more columns.

    joinKeys may be:
      - a single int: join on that column index in both inputs;
      - a sequence whose items are ints (same index on both sides) or
        (leftIndex, rightIndex) pairs.
    The joinKeys argument is not modified.

    Every output row is the left row followed by the right row, so the
    key columns appear twice.  Rows without a match on the other side
    are dropped.  Both inputs must support len() and indexing, and their
    rows must support concatenation with + (as the lists from read() do).

    The order of the output rows follows whichever input is scanned:
    left when it is strictly shorter than right, otherwise right.
    """
    pairs = _normalize_join_keys(joinKeys)
    leftKeys = [l for l, r in pairs]
    rightKeys = [r for l, r in pairs]

    # The shorter input is scanned while the hash table is built on the
    # longer one.  Swapping these roles would change the order of the
    # output rows, so the arrangement is kept as it has always been.
    if len(left) < len(right):
        return _perform_join(left, leftKeys,
                             _build_hash(right, rightKeys), right,
                             lambda x, y: x + y)
    # Here the scanned row comes from the right input, so the arguments
    # are flipped to keep left-then-right column order.
    return _perform_join(right, rightKeys,
                         _build_hash(left, leftKeys), left,
                         lambda x, y: y + x)