View file File name : csv.sl Content :% Copyright (C) 2012-2017,2018 John E. Davis % % This file is part of the S-Lang Library and may be distributed under the % terms of the GNU General Public License. See the file COPYING for % more information. %--------------------------------------------------------------------------- import ("csv"); private define read_fp_callback (info) { variable line, comment_char = info.comment_char; forever { if (-1 == fgets (&line, info.fp)) return NULL; if ((line[0] == comment_char) && (0 == strnbytecmp (line, info.comment, info.comment_len))) continue; return line; } } private define read_strings_callback (str_info) { variable line; if (str_info.output_crlf) { str_info.output_crlf = 0; return "\n"; } variable i = str_info.i; if (i >= str_info.n) return NULL; line = str_info.strings[i]; str_info.i = i+1; if (line[-1] != '\n') str_info.output_crlf = 1; return line; } private define resize_arrays (list, n) { _for (0, length(list)-1, 1) { variable i = (); variable a = list[i]; variable m = length(a); if (m > n) { list[i] = a[[:n-1]]; continue; } variable b = _typeof(a)[n]; b[[:m-1]] = a; list[i] = b; } } private define atofloat (x) { typecast (atof(x), Float_Type); } private define get_blankrows_bits (val) { if (val == "skip") return CSV_SKIP_BLANK_ROWS; if (val == "stop") return CSV_STOP_BLANK_ROWS; return 0; } private define read_row (csv) { % The blank row handling default is to use that of the csv object. if (qualifier_exists ("blankrows")) { return _csv_decode_row (csv.decoder, get_blankrows_bits (qualifier("blankrows"))); } return _csv_decode_row (csv.decoder); } private define fixup_header_names (names) { if (names == NULL) return names; if (typeof (names) == List_Type) names = list_to_array (names); if (_typeof(names) != String_Type) return names; variable is_scalar = (typeof (names) != Array_Type); if (is_scalar) names = [names]; names = strlow (names); variable i = where (names == ""); names[i] = array_map (String_Type, &sprintf, "col%d", i+1); #iffalse % This code is nolonger necessary since slang now allows arbitrary % structure names. names = strtrans (names, "^\\w", "_"); names = strcompress (names, "_"); _for i (0, length(names)-1, 1) { if ('0' <= names[i][0] <= '9') names[i] = "_" + names[i]; } #endif if (is_scalar) names = names[0]; return names; } private define pop_columns_as_array (n) { if (n == 0) return String_Type[0]; try { % allow a mixture of arrays and scalars variable columns = __pop_list (n); columns = [__push_list(columns)]; return columns; } catch TypeMismatchError: { throw TypeMismatchError, "Column arguments cannot be a mixture of ints and strings"; } } private define read_cols () { if ((_NARGS == 0) || (qualifier_exists ("help"))) { usage("struct = .readcol ([columns] ; qualifiers)\n\ where columns is an optional 1-based array of column numbers,\n\ or array of column names.\n\ Qualifiers:\n\ header=header, fields=[array of field names],\n\ type=value|array|string of 's','i','l','f','d' (str,int,long,float,dbl)\n\ typeNTH=val (specifiy type for NTH column)\n\ snan=\"\", inan=0, lnan=0L, fnan=_NaN, dnan=_NaN (defaults for empty fields),\n\ nanNTH=val (value used for an empty field in the NTH column\n\ " ); } variable columns = NULL; if (_NARGS > 1) { columns = pop_columns_as_array (_NARGS-1); } variable csv = (); variable fields = qualifier ("fields"); variable header = qualifier ("header"); variable types = qualifier ("type"); variable snan = qualifier ("snan", ""); variable dnan = qualifier ("dnan", _NaN); variable fnan = qualifier ("fnan", typecast(_NaN,Float_Type)); variable inan = qualifier ("inan", 0); variable lnan = qualifier ("lnan", 0L); if ((fields != NULL) && (columns != NULL) && (length(fields) != length(columns))) throw InvalidParmError, "The fields qualifier must be the same size as the number of columns"; variable flags = get_blankrows_bits (qualifier("blankrows", "skip")); header = fixup_header_names (header); columns = fixup_header_names (columns); variable columns_are_string = _typeof(columns) == String_Type; if ((header == NULL) && columns_are_string) throw InvalidParmError, "No header was supplied to map column names"; variable column_ints = columns, col, i, j; if (columns_are_string) { column_ints = Int_Type[length(columns)]; _for i (0, length(columns)-1, 1) { col = columns[i]; j = wherefirst (col == header); if (j == NULL) throw InvalidParmError, "Unknown (canonical) column name $col"; column_ints[i] = j+1; } } variable row_data = _csv_decode_row (csv.decoder, flags); if (column_ints == NULL) column_ints = [1:length(row_data)]; if (any(column_ints>length(row_data))) { throw InvalidParmError, "column number is too large for data"; } variable ncols = length(column_ints); variable datastruct = NULL; if (fields == NULL) { if (columns_are_string) fields = columns; else if (header != NULL) fields = header[column_ints-1]; else fields = array_map(String_Type, &sprintf, "col%d", column_ints); } datastruct = @Struct_Type(fields); column_ints -= 1; % make 0-based variable convert_funcs = Ref_Type[ncols], convert_func, val; variable nan_values = {}; loop(ncols) list_append(nan_values, snan); if (types == NULL) { types = qualifier_exists ("auto") ? 'A' : 's'; } if (typeof(types) == List_Type) types = list_to_array (types); if (typeof(types) == String_Type) types = bstring_to_array (types); if ((typeof(types) == Array_Type) && (length(types) != ncols)) throw InvalidParmError, "types array must be equal to the number of columns"; if (typeof (types) != Array_Type) types = types[Int_Type[ncols]]; % single (default) type specified variable i1; _for i (1, ncols, 1) { i1 = i-1; types[i1] = qualifier ("type$i"$, types[i1]); } i = where(types=='i'); convert_funcs[i] = &atoi; nan_values[i] = typecast(inan, Int_Type); i = where(types=='l'); convert_funcs[i] = &atol; nan_values[i] = typecast(lnan, Long_Type); i = where(types=='f'); convert_funcs[i] = &atofloat; nan_values[i] = typecast (fnan, Float_Type); i = where(types=='d'); convert_funcs[i] = &atof; nan_values[i] = typecast(dnan, Double_Type); _for i (1, ncols, 1) { i1 = i-1; if (types[i1] == 'A') { variable type = _slang_guess_type (row_data[i1]); if (type == Double_Type) { convert_funcs[i1] = &atof; nan_values[i1] = dnan; types[i1] = 'd'; } else if (type == Int_Type) { convert_funcs[i1] = &atoi; nan_values[i1] = inan; types[i1] = 'i'; } else types[i1] = 's'; } val = nan_values[i1]; nan_values[i1] = typecast (qualifier ("nan$i"$, val), typeof(val)); } variable list_of_arrays = {}, array; variable init_size = 0x8000; variable dsize = init_size; variable max_allocated = init_size; _for i (0, ncols-1, 1) { val = row_data[column_ints[i]]; array = typeof(nan_values[i])[max_allocated]; ifnot (strbytelen(val)) val = nan_values[i]; else { convert_func = convert_funcs[i]; if (convert_func != NULL) val = (@convert_func)(val); } array[0] = val; list_append (list_of_arrays, array); } variable nread = 1; variable min_row_size = 1+max(column_ints); while (row_data = _csv_decode_row (csv.decoder, flags), row_data != NULL) { if (length (row_data) < min_row_size) { % FIXME-- make what to do here configurable if (length(row_data) == 0) break; continue; } if (nread >= max_allocated) { max_allocated += dsize; resize_arrays (list_of_arrays, max_allocated); } _for i (0, ncols-1, 1) { val = row_data[column_ints[i]]; ifnot (strbytelen(val)) { list_of_arrays[i][nread] = nan_values[i]; continue; } convert_func = convert_funcs[i]; if (convert_func == NULL) { list_of_arrays[i][nread] = val; continue; } list_of_arrays[i][nread] = (@convert_func)(val); } nread++; } resize_arrays (list_of_arrays, nread); set_struct_fields (datastruct, __push_list(list_of_arrays)); return datastruct; } define csv_decoder_new () { if (_NARGS != 1) usage ("\ obj = csv_decoder_new (file|fp|strings ; qualifiers);\n\ Qualifiers:\n\ quote='\"', delim=',', skiplines=0, comment=string"); variable fp = (); variable type = typeof(fp); variable func = &read_fp_callback; variable func_data; variable skiplines = qualifier("skiplines", 0); variable delim = qualifier("delim", ','); variable quote = qualifier("quote", '"'); variable comment = qualifier("comment", NULL); variable comment_char = (comment == NULL) ? NULL : comment[0]; variable flags = get_blankrows_bits (qualifier("blankrows", "skip")); if ((type == Array_Type) || (type == List_Type)) { func = &read_strings_callback; func_data = struct { strings = fp, i = skiplines, n = length(fp), output_crlf = 0, comment_char = comment_char, comment = comment, }; } else { if (type != File_Type) { fp = fopen (fp, "r"); if (fp == NULL) throw OpenError, "Unable to open CSV file"$; } func_data = struct { fp = fp, comment_char = comment_char, comment = comment, comment_len = ((comment == NULL) ? 0 : strbytelen(comment)), }; variable line; loop (skiplines) () = fgets (&line, fp); } variable csv = struct { decoder = _csv_decoder_new (func, func_data, delim, quote, flags), readrow = &read_row, readcol = &read_cols, }; return csv; } % Encoder private define writecol () { if ((_NARGS < 3) || qualifier_exists("help")) { usage("\ writecol (file|fp, list_of_column_data | datastruct | col1,col2,...)\n\ Qualifiers:\n\ names=array-of-column-names, noheader, quoteall, quotesome, rdb\n\ " ); } variable csv, data, file; if (_NARGS == 3) { (csv, file, data) = (); } else { data = __pop_list (_NARGS-2); (csv, file) = (); } variable type = typeof (data); if ((type != List_Type) && (type != Array_Type) && not is_struct_type (data)) data = {data}; variable flags = 0; if (qualifier_exists ("quoteall")) flags |= CSV_QUOTE_ALL; if (qualifier_exists ("quotesome")) flags |= CSV_QUOTE_SOME; variable rdb = qualifier_exists ("rdb"); variable fp = file; if (typeof(file) != File_Type) fp = fopen (file, "wb"); if (fp == NULL) throw OpenError, "Error opening $file in write mode"$; variable names = NULL; ifnot (qualifier_exists ("noheader")) { names = qualifier ("names"); if ((names == NULL) && is_struct_type (data)) names = get_struct_field_names (data); } if (is_struct_type (data)) { variable tmp = {}; data = {(_push_struct_field_values(data), pop())}; list_reverse (data); } EXIT_BLOCK { ifnot (__is_same(file, fp)) { if (-1 == fclose (fp)) throw WriteError, "Error closing $file"$; } } variable ncols = length(data); if (length (data) == 0) return; variable nrows = length(data[0]), i, j; _for i (1, ncols-1, 1) { if (nrows != length(data[i])) throw InvalidParmError, "CSV data columns must be the same length"; } variable str, encoder = csv.encoder; if (names != NULL) { if (typeof (names) == List_Type) names = list_to_array (names); str = _csv_encode_row (encoder, names, flags); if (-1 == fputs (str, fp)) throw WriteError, "Write to CSV file failed"; if (rdb) { variable types = String_Type[ncols]; _for i (0, ncols-1, 1) types[i] = __is_datatype_numeric (_typeof(data[i])) ? "N" : "S"; str = _csv_encode_row (encoder, types, flags); if (-1 == fputs (str, fp)) throw WriteError, "Write to CSV file failed"; } } variable row_data = String_Type[ncols]; _for i (0, nrows-1, 1) { _for j (0, ncols-1, 1) row_data[j] = string (data[j][i]); str = _csv_encode_row (encoder, row_data, flags); if (-1 == fputs (str, fp)) throw WriteError, "Write to CSV file failed"; } } define csv_encoder_new () { if (qualifier_exists ("help")) { usage ("csv = csv_encoder_new ();\n\ Qualifiers:\n\ delim=','\n\ quote='\"'\n\ quotesome, quoteall\n\ rdb\n\ " ); } variable flags = 0; if (qualifier_exists ("quoteall")) flags |= CSV_QUOTE_ALL; if (qualifier_exists ("quotesome")) flags |= CSV_QUOTE_SOME; variable quotechar = qualifier ("quote", '"'); variable delimchar = qualifier ("delim", qualifier_exists ("rdb") ? '\t' : ','); variable csv = struct { encoder = _csv_encoder_new (delimchar, quotechar, flags), writecol = &writecol, }; return csv; } define csv_writecol () { if ((_NARGS < 2) || qualifier_exists("help")) { usage("\ csv_writecol (file|fp, list_of_column_data | datastruct | col1,col2,...)\n\ Qualifiers:\n\ names=array-of-column-names, noheader, quote=val, quoteall, quotesome\n\ " ); } variable args = __pop_list (_NARGS); variable csv = csv_encoder_new (;;__qualifiers); csv.writecol (__push_list(args);;__qualifiers); } private define convert_to_numeric (s, name) { variable val = get_struct_field (s, name); variable num = length (val); if ((num == 0) || (_typeof (val) != String_Type)) return; EXIT_BLOCK { set_struct_field (s, name, val); } variable types = DataType_Type[num]; _for (0, length (val)-1, 1) { variable i = (); variable type = _slang_guess_type (val[i]); if (type == Double_Type) { val = atof (val); return; } types[i] = type; } if (all (types == Int_Type)) { val = atoi (val); return; } if (any (types == Float_Type)) { val = atofloat (val); return; } if (any (types == Long_Type)) { val = atol (val); return; } if (any (types == Int_Type)) { val = atoi (val); return; } val = atof (val); } define csv_readcol () { if ((_NARGS == 0) || qualifier_exists("help")) { usage ("struct = csvreadcol (file|fp [,columns] ;qualifier)\n\ where columns is an optional 1-based array of column numbers,\n\ or array of column names.\n\ Qualifiers:\n\ quote='\"', delim=',', skiplines=0, comment=string, has_header,\n\ header=header, fields=[array of field names],\n\ type=value|array of 's','i','l','f','d' (string,int,long,float,double)\n\ typeNTH=val (specifiy type for NTH column)\n\ snan=\"\", inan=0, lnan=0L, fnan=_NaN, dnan=_NaN (defaults for empty fields),\n\ nanNTH=val (value used for an empty field in the NTH column\n\ " ); } variable file, columns; columns = __pop_list (_NARGS-1); file = (); variable q = __qualifiers (); variable rdb = qualifier_exists ("rdb"); % rdb files are tab-delimited files, # is a comment character, % the first non-comment line contains the field names, the % second line gives the field types. if (rdb) { q = struct { comment = "#", delim = '\t' }; } variable types = NULL; variable csv = csv_decoder_new (file ;; q); if (rdb || qualifier_exists ("has_header")) { variable header = csv.readrow (); q = struct { header=header, @q }; if (rdb) { % The type field consists of an integer, followed by a % type specifier, and a justification character. The % integer and justification characters are for display % purposes. The type specifier is N for numberic, S for % string, M for month. Here, M and S will be treated the % same. types = csv.readrow (); types = strtrans (types, "0-9<>", ""); } } variable s = csv.readcol (__push_list(columns) ;; q); if (rdb) { ifnot (length (columns)) columns = header; header = fixup_header_names (header); foreach (columns) { variable col = (); if (typeof (col) == String_Type) col = fixup_header_names (col); else col = header[col-1]; variable i = wherefirst (col == header); if ((i == NULL) || (types[i] != "N")) continue; convert_to_numeric (s, col); } } return s; }