|
View:
New views
3 Messages
—
Rating Filter:
Alert me
|
|
|
fix for reading funny compressed data, for review
I'd like to push this to the stable branch. Comments
appreciated. commit e624e2da6ea68d22e6d4fba4eaa96d37d07a6730 Author: Ben Pfaff <blp@...> Date: Wed Oct 14 21:20:44 2009 -0700 sys-file-reader: Tolerate nonsensical opcodes in compressed data. Compressed data in .sav files uses a set of 256 opcodes, some of which make sense only for numeric data and others of which only make sense for string data. However, Jereme Thomas <jereme.thomas@...> has provided one file, written by SPSS 14, that uses an opcode that seems to makes sense only for numeric data in a string field. So this commit adds support for these opcodes, although it still warns about the ones other than the exact one found in the file provided by Jereme. diff --git a/doc/dev/system-file-format.texi b/doc/dev/system-file-format.texi index 70fa385..b1be385 100644 --- a/doc/dev/system-file-format.texi +++ b/doc/dev/system-file-format.texi @@ -884,6 +884,9 @@ value @var{code} - @var{bias}, where variable @code{bias} from the file header. For example, code 105 with bias 100.0 (the normal value) indicates a numeric variable of value 5. +One file has been seen written by SPSS 14 that contained such a code +in a @emph{string} field with the value 0 (after the bias is +subtracted) as a way of encoding null bytes. @item 252 End of file. This code may or may not appear at the end of the data diff --git a/src/data/sys-file-reader.c b/src/data/sys-file-reader.c index fe7b533..8d973e4 100644 --- a/src/data/sys-file-reader.c +++ b/src/data/sys-file-reader.c @@ -86,6 +86,7 @@ struct sfm_reader double bias; /* Compression bias, usually 100.0. */ uint8_t opcodes[8]; /* Current block of opcodes. */ size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */ + bool corruption_warning; /* Warned about possible corruption? 
*/ }; static const struct casereader_class sys_file_casereader_class; @@ -192,6 +193,7 @@ sfm_open_reader (struct file_handle *fh, struct dictionary **dict, r->oct_cnt = 0; r->has_long_var_names = false; r->opcode_idx = sizeof r->opcodes; + r->corruption_warning = false; /* TRANSLATORS: this fragment will be interpolated into messages in fh_lock() that identify types of files. */ @@ -1374,7 +1376,14 @@ read_compressed_number (struct sfm_reader *r, double *d) break; case 254: - sys_error (r, _("Compressed data is corrupt.")); + float_convert (r->float_format, " ", FLOAT_NATIVE_DOUBLE, d); + if (!r->corruption_warning) + { + r->corruption_warning = true; + sys_warn (r, _("Possible compressed data corruption: " + "compressed spaces appear in numeric field.")); + } + break; case 255: *d = SYSMIS; @@ -1395,7 +1404,8 @@ read_compressed_number (struct sfm_reader *r, double *d) static bool read_compressed_string (struct sfm_reader *r, char *dst) { - switch (read_opcode (r)) + int opcode = read_opcode (r); + switch (opcode) { case -1: case 252: @@ -1410,7 +1420,25 @@ read_compressed_string (struct sfm_reader *r, char *dst) break; default: - sys_error (r, _("Compressed data is corrupt.")); + { + double value = opcode - r->bias; + float_convert (FLOAT_NATIVE_DOUBLE, &value, r->float_format, dst); + if (value == 0.0) + { + /* This has actually been seen "in the wild". The submitter of the + file that showed that the contents decoded as spaces, but they + were at the end of the field so it's possible that the null + bytes just acted as null terminators. */ + } + else if (!r->corruption_warning) + { + r->corruption_warning = true; + sys_warn (r, _("Possible compressed data corruption: " + "string contains compressed integer (opcode %d)"), + opcode); + } + } + break; } return true; -- Peter Seebach on managing engineers: "It's like herding cats, only most of the engineers are already sick of laser pointers." 
_______________________________________________ pspp-dev mailing list pspp-dev@... http://lists.gnu.org/mailman/listinfo/pspp-dev |
|
|
Re: fix for reading funny compressed data, for review
It seems reasonable to me.
J' On Wed, Oct 14, 2009 at 09:44:52PM -0700, Ben Pfaff wrote: I'd like to push this to the stable branch. Comments appreciated. commit e624e2da6ea68d22e6d4fba4eaa96d37d07a6730 Author: Ben Pfaff <blp@...> Date: Wed Oct 14 21:20:44 2009 -0700 sys-file-reader: Tolerate nonsensical opcodes in compressed data. Compressed data in .sav files uses a set of 256 opcodes, some of which make sense only for numeric data and others of which only make sense for string data. However, Jereme Thomas <jereme.thomas@...> has provided one file, written by SPSS 14, that uses an opcode that seems to makes sense only for numeric data in a string field. So this commit adds support for these opcodes, although it still warns about the ones other than the exact one found in the file provided by Jereme. diff --git a/doc/dev/system-file-format.texi b/doc/dev/system-file-format.texi index 70fa385..b1be385 100644 --- a/doc/dev/system-file-format.texi +++ b/doc/dev/system-file-format.texi @@ -884,6 +884,9 @@ value @var{code} - @var{bias}, where variable @code{bias} from the file header. For example, code 105 with bias 100.0 (the normal value) indicates a numeric variable of value 5. +One file has been seen written by SPSS 14 that contained such a code +in a @emph{string} field with the value 0 (after the bias is +subtracted) as a way of encoding null bytes. @item 252 End of file. This code may or may not appear at the end of the data diff --git a/src/data/sys-file-reader.c b/src/data/sys-file-reader.c index fe7b533..8d973e4 100644 --- a/src/data/sys-file-reader.c +++ b/src/data/sys-file-reader.c @@ -86,6 +86,7 @@ struct sfm_reader double bias; /* Compression bias, usually 100.0. */ uint8_t opcodes[8]; /* Current block of opcodes. */ size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */ + bool corruption_warning; /* Warned about possible corruption? 
*/ }; static const struct casereader_class sys_file_casereader_class; @@ -192,6 +193,7 @@ sfm_open_reader (struct file_handle *fh, struct dictionary **dict, r->oct_cnt = 0; r->has_long_var_names = false; r->opcode_idx = sizeof r->opcodes; + r->corruption_warning = false; /* TRANSLATORS: this fragment will be interpolated into messages in fh_lock() that identify types of files. */ @@ -1374,7 +1376,14 @@ read_compressed_number (struct sfm_reader *r, double *d) break; case 254: - sys_error (r, _("Compressed data is corrupt.")); + float_convert (r->float_format, " ", FLOAT_NATIVE_DOUBLE, d); + if (!r->corruption_warning) + { + r->corruption_warning = true; + sys_warn (r, _("Possible compressed data corruption: " + "compressed spaces appear in numeric field.")); + } + break; case 255: *d = SYSMIS; @@ -1395,7 +1404,8 @@ read_compressed_number (struct sfm_reader *r, double *d) static bool read_compressed_string (struct sfm_reader *r, char *dst) { - switch (read_opcode (r)) + int opcode = read_opcode (r); + switch (opcode) { case -1: case 252: @@ -1410,7 +1420,25 @@ read_compressed_string (struct sfm_reader *r, char *dst) break; default: - sys_error (r, _("Compressed data is corrupt.")); + { + double value = opcode - r->bias; + float_convert (FLOAT_NATIVE_DOUBLE, &value, r->float_format, dst); + if (value == 0.0) + { + /* This has actually been seen "in the wild". The submitter of the + file that showed that the contents decoded as spaces, but they + were at the end of the field so it's possible that the null + bytes just acted as null terminators. */ + } + else if (!r->corruption_warning) + { + r->corruption_warning = true; + sys_warn (r, _("Possible compressed data corruption: " + "string contains compressed integer (opcode %d)"), + opcode); + } + } + break; } return true; -- Peter Seebach on managing engineers: "It's like herding cats, only most of the engineers are already sick of laser pointers." 
_______________________________________________ pspp-dev mailing list pspp-dev@... http://lists.gnu.org/mailman/listinfo/pspp-dev -- PGP Public key ID: 1024D/2DE827B3 fingerprint = 8797 A26D 0854 2EAB 0285 A290 8A67 719C 2DE8 27B3 See http://pgp.mit.edu or any PGP keyserver for public key. _______________________________________________ pspp-dev mailing list pspp-dev@... http://lists.gnu.org/mailman/listinfo/pspp-dev |
|
|
Re: fix for reading funny compressed data, for review
Thanks. I pushed it out.
John Darrington <john@...> writes: > It seems reasonable to me. > > J' > > On Wed, Oct 14, 2009 at 09:44:52PM -0700, Ben Pfaff wrote: > I'd like to push this to the stable branch. Comments > appreciated. > > commit e624e2da6ea68d22e6d4fba4eaa96d37d07a6730 > Author: Ben Pfaff <blp@...> > Date: Wed Oct 14 21:20:44 2009 -0700 > > sys-file-reader: Tolerate nonsensical opcodes in compressed data. > > Compressed data in .sav files uses a set of 256 opcodes, some of which make > sense only for numeric data and others of which only make sense for string > data. However, Jereme Thomas <jereme.thomas@...> has provided one > file, written by SPSS 14, that uses an opcode that seems to makes sense > only for numeric data in a string field. So this commit adds support for > these opcodes, although it still warns about the ones other than the exact > one found in the file provided by Jereme. > > diff --git a/doc/dev/system-file-format.texi b/doc/dev/system-file-format.texi > index 70fa385..b1be385 100644 > --- a/doc/dev/system-file-format.texi > +++ b/doc/dev/system-file-format.texi > @@ -884,6 +884,9 @@ value @var{code} - @var{bias}, where > variable @code{bias} from the file header. For example, > code 105 with bias 100.0 (the normal value) indicates a numeric variable > of value 5. > +One file has been seen written by SPSS 14 that contained such a code > +in a @emph{string} field with the value 0 (after the bias is > +subtracted) as a way of encoding null bytes. > > @item 252 > End of file. This code may or may not appear at the end of the data > diff --git a/src/data/sys-file-reader.c b/src/data/sys-file-reader.c > index fe7b533..8d973e4 100644 > --- a/src/data/sys-file-reader.c > +++ b/src/data/sys-file-reader.c > @@ -86,6 +86,7 @@ struct sfm_reader > double bias; /* Compression bias, usually 100.0. */ > uint8_t opcodes[8]; /* Current block of opcodes. */ > size_t opcode_idx; /* Next opcode to interpret, 8 if none left. 
*/ > + bool corruption_warning; /* Warned about possible corruption? */ > }; > > static const struct casereader_class sys_file_casereader_class; > @@ -192,6 +193,7 @@ sfm_open_reader (struct file_handle *fh, struct dictionary **dict, > r->oct_cnt = 0; > r->has_long_var_names = false; > r->opcode_idx = sizeof r->opcodes; > + r->corruption_warning = false; > > /* TRANSLATORS: this fragment will be interpolated into > messages in fh_lock() that identify types of files. */ > @@ -1374,7 +1376,14 @@ read_compressed_number (struct sfm_reader *r, double *d) > break; > > case 254: > - sys_error (r, _("Compressed data is corrupt.")); > + float_convert (r->float_format, " ", FLOAT_NATIVE_DOUBLE, d); > + if (!r->corruption_warning) > + { > + r->corruption_warning = true; > + sys_warn (r, _("Possible compressed data corruption: " > + "compressed spaces appear in numeric field.")); > + } > + break; > > case 255: > *d = SYSMIS; > @@ -1395,7 +1404,8 @@ read_compressed_number (struct sfm_reader *r, double *d) > static bool > read_compressed_string (struct sfm_reader *r, char *dst) > { > - switch (read_opcode (r)) > + int opcode = read_opcode (r); > + switch (opcode) > { > case -1: > case 252: > @@ -1410,7 +1420,25 @@ read_compressed_string (struct sfm_reader *r, char *dst) > break; > > default: > - sys_error (r, _("Compressed data is corrupt.")); > + { > + double value = opcode - r->bias; > + float_convert (FLOAT_NATIVE_DOUBLE, &value, r->float_format, dst); > + if (value == 0.0) > + { > + /* This has actually been seen "in the wild". The submitter of the > + file that showed that the contents decoded as spaces, but they > + were at the end of the field so it's possible that the null > + bytes just acted as null terminators. 
*/ > + } > + else if (!r->corruption_warning) > + { > + r->corruption_warning = true; > + sys_warn (r, _("Possible compressed data corruption: " > + "string contains compressed integer (opcode %d)"), > + opcode); > + } > + } > + break; > } > > return true; > > -- > Peter Seebach on managing engineers: > "It's like herding cats, only most of the engineers are already > sick of laser pointers." > > > _______________________________________________ > pspp-dev mailing list > pspp-dev@... > http://lists.gnu.org/mailman/listinfo/pspp-dev -- Peter Seebach on managing engineers: "It's like herding cats, only most of the engineers are already sick of laser pointers." _______________________________________________ pspp-dev mailing list pspp-dev@... http://lists.gnu.org/mailman/listinfo/pspp-dev |
| Free embeddable forum powered by Nabble | Forum Help |