File: //usr/lib64/python2.6/site-packages/mx/Misc/CSV.py
""" CSV (Comma Separated Values) table format reader/writer.
The two classes read and write CSV data. It's a fairly simple
format for data exchange with nearly all spreadsheets, databases,
organizers, etc.
The reader class is built to be easy on the data passed in: small
errors like missing commas don't cause an exception (but do set a
variable to indicate this). Data is converted from CSV text into a
an internal format which can then be extracted into different
forms of table representation.
The writer writes standard CSV files and knows about quoting
rules, separator handling etc. so that interfacing to spreadsheets
and databases should pose no problem.
Both classes can be subclassed to enhance/modify their behaviour.
Copyright (c) 2000-2005, Marc-Andre Lemburg; mailto:mal@lemburg.com
Copyright (c) 2000-2008, eGenix.com Software GmbH; mailto:info@egenix.com
See the documentation for further information on copyrights,
or contact the author. All Rights Reserved.
"""
__version__ = '1.0'
_debug = 0
import re,string,sys,types,exceptions
import mx.Tools.NewBuiltins
from mx import TextTools
### Errors
class Error(exceptions.StandardError):
pass
### Reader base class
class Reader:
# Did decoding have errors ?
errors = 0
# If there were errors, then this list contains the line number of
# the lines with errors
errorlines = None
# List of lines, each being a list of strings
lines = None
# Width of the received data (max. number of entries per line)
width = 0
# List of column names found in the data's first row; possibly
# filtered through .filter_header()
columns = None
# String of separator characters which are used to separate items
# on an input line
separators = ',;\t'
# My version, which runs in exponential time for '"abcdefgh...'
# parseitem = re.compile(r'"([^"]+|"")*"|[^",;\015\012]*')
# Thanks to Andrew Kuchling for helping me with this "simple" RE
parseitem = re.compile(r'"([^"]|"")*"|[^"' + separators + r'\015\012]*')
def __init__(self, separators=None):
if separators is not None:
self.parseitem = re.compile(r'"([^"]|"")*"|[^"'
+ separators + r'\015\012]*')
def flush(self):
""" Empty the object and reset errors
"""
self.lines = None
self.width = 0
self.errors = 0
self.columns = None
def load(self, file, header=1, columns=None,
StringType=types.StringType):
""" Read a file.
If header is true (default), the first line of input is
interpreted as list of column names.
If columns is given as list of strings, these columns
names are used.
If both header and columns are used, columns overrides the
columns set by reading the header line. This is useful to
override header information from the input data.
"""
if type(file) == StringType:
file = open(file,'rb')
text = file.read()
file.close()
self.lines = self._decode(text)
if header:
self.process_header()
if columns is not None:
self.set_columns(columns)
def loads(self, text, header=1, columns=None):
""" Read the given text
If header is true (default), the first line of input is
interpreted as list of column names.
If columns is given as list of strings, these columns
names are used.
If both header and columns are used, columns overrides the
columns set by reading the header line. This is useful to
override header information from the input data.
"""
self.lines = self._decode(text)
if header:
self.process_header()
if columns is not None:
self.set_columns(columns)
def list(self):
""" Return the current data as list of lists, each having
self.width string entries.
Missing entries are set to None.
"""
width = self.width
lines = self.lines
table = [None] * len(lines)
for i, row in irange(lines):
row = row[:]
if len(row) < width:
row[len(row):] = [None]*(width-len(row))
table[i] = row
return table
def dictionary(self):
""" Return the current data as dictionary of lists of strings,
with one entry for each column.
.columns must have been set using .set_columns() or by
processing a given CSV header.
"""
table = {}
lines = self.lines
keys = self.columns
if keys is None:
raise Error,'no columns set'
rows = len(lines)
for k in keys:
table[k] = [None] * rows
for i, key in irange(keys):
column = table[key]
for j, row in irange(lines):
if len(row) > i:
column[j] = row[i]
return table
def objects(self,constructor):
""" Builds a list of objects by calling the given constructor
with keywords defined by mapping column names to values for
each input line.
.columns must have been set using .set_columns() or by
processing a given CSV header.
"""
lines = self.lines
keys = self.columns
if keys is None:
raise Error,'no columns set'
objs = [None] * len(lines)
for i,line in lines:
kws = dict(tuples(keys, line))
objs[i] = apply(constructor,(),kws)
return l
def process_header(self):
""" Process the header data.
This also sets the .columns attribute. The header is
removed from the data in .lines after having been
processed.
The header is passed through .filter_header() before
interpreting it as list of column names.
"""
lines = self.lines
if len(lines) < 1:
raise Error,'missing header data'
self.columns = self.filter_header(lines[0])
del lines[0]
def set_columns(self, columns):
""" Sets the column names to use.
This overrides any column names possibly given in the read
data.
"""
self.columns = columns
def filter_header(self, header,
lower=TextTools.lower,split=string.split,
join=string.join):
""" Filter the given header line.
The base class converts the column names to all lowercase
and removes any whitespace included in the header.
This method is only called in case the header was read
from the data provided to the object.
"""
l = [''] * len(header)
for i,column in irange(header):
l[i] = join(split(lower(column)),'')
return l
def description(self, header=1):
""" Return a list of tuples (column name, max length) found in the
data.
If header is true (default), the column names themselves
are included in the calculation.
"""
lines = self.lines
columns = self.columns
width = len(columns)
if header:
lengths = []
for column in columns:
lengths.append(len(column))
else:
lengths = [0] * width
for row in self.lines:
for i,o in irange(row[:width]):
if len(o) > lengths[i]:
lengths[i] = len(o)
return map(None,columns,lengths)
def _decode(self,text,
find=string.find):
""" Decode the CSV data in text.
Internal method. Do not use directly.
"""
lines = []
x = 0
length = len(text)
width = 0
errorlines = []
match = self.parseitem.match
separators = self.separators
while x < length:
l = []
while x <= length:
# Find next token
m = match(text,x)
if not m:
# End of line
break
y = m.regs[0][1]
l.append(text[x:y])
if _debug:
print x,repr(l[-1])
x = y + 1
if x > length:
break
# Check validity
if text[y:x] not in separators:
if text[y:y+2] == '\015\012':
# Handle CRLF
x = y + 2
elif text[y:x] not in '\015\012':
# Syntax error: missing ',' or ';'
# Action: skip to end of line
y = find(text,'\012',x)
if y < 0:
y = find(test,'\015',x)
if y < 0:
# Skip to end of text
x = length
else:
x = y + 1
if _debug:
print 'errors in',l,x
errorlines.append(len(lines))
# else: found single CR or LF
break
if len(l) > width:
width = len(l)
if _debug:
print 'adding',l,x,repr(text[x:x+5])
lines.append(l)
self.width = width
if errorlines:
self.errors = 1
self.errorlines = errorlines
return map(self._unquote,lines)
def _unquote(self,line,
replace=string.replace):
""" Unquote a CSV style quoted line of text.
Internal method. Do not use directly.
"""
for i,text in irange(line):
if text[:1] == '"' and text[-1:] == '"':
text = text[1:-1]
line[i] = replace(text,'""','"')
return line
def __len__(self):
return len(self.lines)
def __str__(self):
lines = self.list()
desc = self.description()
width = 0
output = []
write = output.append
for col in desc:
write('%-*s|' % (col[1],col[0]))
write('\n')
for col in desc:
write('=' * col[1] + '+')
write('\n')
for line in lines:
for i,item in irange(line):
write('%-*s|' % (desc[i][1],item))
write('\n')
return string.join(output,'')
### Writer base class
class Writer:
# Column names
columns = None
# CSV text
text = ''
# Separator to use for separating fields of a row (default: comma)
separator = ','
# End-of-line marker to use (default: CRLF)
lineend = '\015\012'
def __init__(self, separator=None, lineend=None):
if separator is not None:
self.separator = separator
if lineend is not None:
self.lineend = lineend
def flush(self):
""" Flush the data currently stored in the writer.
"""
self.text = ''
self.columns = None
def set_columns(self, columns, header=1,
join=string.join):
""" Sets the output columns.
If header is true, a column name line is added to the
output.
Columns can only be set once per session and must be set
prior to adding any data. columns has to be a list of
column names.
It is assured that no more than len(columns) items are
written for each row. All rows are filled up with ""
entries to have an equal number of items.
"""
if columns == self.columns and not header:
# Nothing to do
return
elif self.columns:
raise Error,'cannot write columns more than once per session'
self.columns = columns
if header:
if self.text:
raise Error,'cannot add header to already written data'
headerline = self._quote(columns)
self.text = join(headerline, self.separator) + self.lineend
def feed_list(self,table,
join=string.join):
""" Feeds a table (list of rows) which is converted
to CSV.
No more than len(columns) items are written for each
row. All rows are filled up with "" entries to have an
equal number of items. None entries are converted to empty
strings, all other objects are stringified.
"""
columns = self.columns
if columns:
rowlen = len(columns)
else:
# Calculate the max. number of columns in the table
rowlen = max(map(len,table))
# Prepare an empty table
t = [None] * len(table)
_quote = self._quote
# Fill in data
for i,row in irange(table):
row = _quote(row[:rowlen])
if len(row) < rowlen:
row[len(row):] = ['""'] * (rowlen - len(row))
t[i] = join(row,self.separator)
# Add final CRLF and add as CSV text
t.append('')
self.text = self.text + join(t,self.lineend)
def feed_dict(self,table,rows=None,
join=string.join):
""" Feeds a table (dict of lists) which is converted
to CSV.
Only the keys set as column names are used to form the CSV
data.
All lists in the dictionary must have equal length or at
least rows number of entries, if rows is given. None
entries are converted to empty strings, all other objects
are stringified.
"""
columns = self.columns
if not columns:
raise Error,'no output columns set'
rowlen = len(columns)
# Create an emtpy table
if not rows:
rows = 0
for column in columns:
nrows = len(table[column])
if nrows > rows:
rows = nrows
rowindices = trange(rows)
t = [None] * rows
for i in rowindices:
t[i] = [None] * rowlen
# Fill the table
for j,k in irange(columns):
for i in rowindices:
t[i][j] = table[k][i]
# Quote and join lines
t = map(self._quote,t)
t = map(join,t,(self.separator,) * rows)
# Add final CRLF and store CSV text
t.append('')
self.text = self.text + join(t,self.lineend)
def feed_objects(self,objects,
join=string.join,getattr=getattr):
""" Feeds a sequence of objects which is converted to CSV.
For each object the set column names are interpreted as
object attributes and used as basis for the CSV data.
None values are converted to empty strings, all other
attributes are added stringified.
"""
columns = self.columns
if not columns:
raise Error,'no output columns set'
rowlen = len(columns)
# Create an emtpy table
rows = len(objects)
rowindices = trange(rows)
t = [None] * rows
for i in rowindices:
t[i] = [None] * rowlen
# Fill the table
icols = irange(columns)
for i in rowindices:
obj = objects[i]
for j,name in icols:
t[i][j] = str(getattr(obj, name))
# Quote and join lines
t = map(self._quote, t)
t = map(join,t,(self.separator,) * rows)
# Add final CRLF and store CSV text
t.append('')
self.text = self.text + join(t,self.lineend)
def dumps(self):
""" Returns the data as CSV text
"""
return self.text
def dump(self, file,
StringType=types.StringType):
""" Write the converted CSV data to a file
"""
if type(file) == StringType:
file = open(file,'wb')
file.write(self.text)
file.close()
else:
file.write(self.text)
def _quote(self, line,
replace=string.replace,str=str):
""" CSV style quote the given line of text.
"""
nline = ['""'] * len(line)
for i,item in irange(line):
if item is not None:
text = str(item)
else:
text = ''
nline[i] = '"%s"' % replace(text,'"','""')
return nline
def _test():
import sys
global _debug
# Turn on debugging output
#_debug = 1
s = """"key1","key2",key3,key4
"abc",0,,0,"def""ghi
Eine neue Zeile
0","hallo",
"line",with"errors,
"text with,embedded,commas",2,3,4
newline,new,luck,;123,""\015
\"""Quote""\",1,2,3,"""#"
r = Reader()
r.loads(s)
lt = r.list()
print lt
print r.dictionary()
r.set_columns(('Name','Vorname'))
print r.dictionary()
print r.description()
print '-' * 72
w = Writer()
w.feed_list(r.list())
w.feed_list(r.list())
s = w.dumps()
print s
#w.dump('test.txt')
w.flush()
dict = r.dictionary()
w.set_columns(dict.keys())
w.feed_dict(dict)
s = w.dumps()
print '-->',s
r.flush()
r.loads(s)
lt = r.list()
print lt
print r.dictionary()
r.set_columns(('Name','Vorname'))
print r.dictionary()
print r.description()
print '-' * 72
class Obj:
pass
data = []
for i in range(10):
o = Obj()
o.id = i
o.str = str(i)
data.append(o)
w.flush()
w.set_columns(('id', 'str'))
w.feed_objects(data)
w.feed_objects(data)
s = w.dumps()
print '-->',s
r.flush()
r.loads(s)
lt = r.list()
print lt
print r.dictionary()
r.set_columns(('id','str'))
print r.dictionary()
print r.description()
print '-' * 72
print
print 'Read File:',sys.argv[1]
r.flush()
f = open(sys.argv[1])
r.load(f)
print r
if __name__ == '__main__':
_test()