fix for reading funny compressed data, for review

View: New views
3 Messages — Rating Filter:   Alert me  

fix for reading funny compressed data, for review

by Ben Pfaff :: Rate this Message:

Reply to Author | View Threaded | Show Only this Message

I'd like to push this to the stable branch.  Comments
appreciated.

commit e624e2da6ea68d22e6d4fba4eaa96d37d07a6730
Author: Ben Pfaff <blp@...>
Date:   Wed Oct 14 21:20:44 2009 -0700

    sys-file-reader: Tolerate nonsensical opcodes in compressed data.
   
    Compressed data in .sav files uses a set of 256 opcodes, some of which make
    sense only for numeric data and others of which only make sense for string
    data.  However, Jereme Thomas <jereme.thomas@...> has provided one
    file, written by SPSS 14, that uses an opcode that seems to make sense
    only for numeric data in a string field.  So this commit adds support for
    these opcodes, although it still warns about the ones other than the exact
    one found in the file provided by Jereme.

diff --git a/doc/dev/system-file-format.texi b/doc/dev/system-file-format.texi
index 70fa385..b1be385 100644
--- a/doc/dev/system-file-format.texi
+++ b/doc/dev/system-file-format.texi
@@ -884,6 +884,9 @@ value @var{code} - @var{bias}, where
 variable @code{bias} from the file header.  For example,
 code 105 with bias 100.0 (the normal value) indicates a numeric variable
 of value 5.
+One file has been seen written by SPSS 14 that contained such a code
+in a @emph{string} field with the value 0 (after the bias is
+subtracted) as a way of encoding null bytes.
 
 @item 252
 End of file.  This code may or may not appear at the end of the data
diff --git a/src/data/sys-file-reader.c b/src/data/sys-file-reader.c
index fe7b533..8d973e4 100644
--- a/src/data/sys-file-reader.c
+++ b/src/data/sys-file-reader.c
@@ -86,6 +86,7 @@ struct sfm_reader
     double bias; /* Compression bias, usually 100.0. */
     uint8_t opcodes[8];         /* Current block of opcodes. */
     size_t opcode_idx;          /* Next opcode to interpret, 8 if none left. */
+    bool corruption_warning;    /* Warned about possible corruption? */
   };
 
 static const struct casereader_class sys_file_casereader_class;
@@ -192,6 +193,7 @@ sfm_open_reader (struct file_handle *fh, struct dictionary **dict,
   r->oct_cnt = 0;
   r->has_long_var_names = false;
   r->opcode_idx = sizeof r->opcodes;
+  r->corruption_warning = false;
 
   /* TRANSLATORS: this fragment will be interpolated into
      messages in fh_lock() that identify types of files. */
@@ -1374,7 +1376,14 @@ read_compressed_number (struct sfm_reader *r, double *d)
       break;
 
     case 254:
-      sys_error (r, _("Compressed data is corrupt."));
+      float_convert (r->float_format, "        ", FLOAT_NATIVE_DOUBLE, d);
+      if (!r->corruption_warning)
+        {
+          r->corruption_warning = true;
+          sys_warn (r, _("Possible compressed data corruption: "
+                         "compressed spaces appear in numeric field."));
+        }
+      break;
 
     case 255:
       *d = SYSMIS;
@@ -1395,7 +1404,8 @@ read_compressed_number (struct sfm_reader *r, double *d)
 static bool
 read_compressed_string (struct sfm_reader *r, char *dst)
 {
-  switch (read_opcode (r))
+  int opcode = read_opcode (r);
+  switch (opcode)
     {
     case -1:
     case 252:
@@ -1410,7 +1420,25 @@ read_compressed_string (struct sfm_reader *r, char *dst)
       break;
 
     default:
-      sys_error (r, _("Compressed data is corrupt."));
+      {
+        double value = opcode - r->bias;
+        float_convert (FLOAT_NATIVE_DOUBLE, &value, r->float_format, dst);
+        if (value == 0.0)
+          {
+            /* This has actually been seen "in the wild".  The submitter of the
+               file showed that the contents decoded as spaces, but they
+               were at the end of the field so it's possible that the null
+               bytes just acted as null terminators. */
+          }
+        else if (!r->corruption_warning)
+          {
+            r->corruption_warning = true;
+            sys_warn (r, _("Possible compressed data corruption: "
+                           "string contains compressed integer (opcode %d)"),
+                      opcode);
+          }
+      }
+      break;
     }
 
   return true;

--
Peter Seebach on managing engineers:
"It's like herding cats, only most of the engineers are already
 sick of laser pointers."


_______________________________________________
pspp-dev mailing list
pspp-dev@...
http://lists.gnu.org/mailman/listinfo/pspp-dev

Re: fix for reading funny compressed data, for review

by John Darrington-3 :: Rate this Message:

Reply to Author | View Threaded | Show Only this Message

It seems reasonable to me.

J'

On Wed, Oct 14, 2009 at 09:44:52PM -0700, Ben Pfaff wrote:
     I'd like to push this to the stable branch.  Comments
     appreciated.
     
     commit e624e2da6ea68d22e6d4fba4eaa96d37d07a6730
     Author: Ben Pfaff <blp@...>
     Date:   Wed Oct 14 21:20:44 2009 -0700
     
         sys-file-reader: Tolerate nonsensical opcodes in compressed data.
         
         Compressed data in .sav files uses a set of 256 opcodes, some of which make
         sense only for numeric data and others of which only make sense for string
         data.  However, Jereme Thomas <jereme.thomas@...> has provided one
         file, written by SPSS 14, that uses an opcode that seems to make sense
         only for numeric data in a string field.  So this commit adds support for
         these opcodes, although it still warns about the ones other than the exact
         one found in the file provided by Jereme.
     
     diff --git a/doc/dev/system-file-format.texi b/doc/dev/system-file-format.texi
     index 70fa385..b1be385 100644
     --- a/doc/dev/system-file-format.texi
     +++ b/doc/dev/system-file-format.texi
     @@ -884,6 +884,9 @@ value @var{code} - @var{bias}, where
      variable @code{bias} from the file header.  For example,
      code 105 with bias 100.0 (the normal value) indicates a numeric variable
      of value 5.
     +One file has been seen written by SPSS 14 that contained such a code
     +in a @emph{string} field with the value 0 (after the bias is
     +subtracted) as a way of encoding null bytes.
     
      @item 252
      End of file.  This code may or may not appear at the end of the data
     diff --git a/src/data/sys-file-reader.c b/src/data/sys-file-reader.c
     index fe7b533..8d973e4 100644
     --- a/src/data/sys-file-reader.c
     +++ b/src/data/sys-file-reader.c
     @@ -86,6 +86,7 @@ struct sfm_reader
          double bias; /* Compression bias, usually 100.0. */
          uint8_t opcodes[8];         /* Current block of opcodes. */
          size_t opcode_idx;          /* Next opcode to interpret, 8 if none left. */
     +    bool corruption_warning;    /* Warned about possible corruption? */
        };
     
      static const struct casereader_class sys_file_casereader_class;
     @@ -192,6 +193,7 @@ sfm_open_reader (struct file_handle *fh, struct dictionary **dict,
        r->oct_cnt = 0;
        r->has_long_var_names = false;
        r->opcode_idx = sizeof r->opcodes;
     +  r->corruption_warning = false;
     
        /* TRANSLATORS: this fragment will be interpolated into
           messages in fh_lock() that identify types of files. */
     @@ -1374,7 +1376,14 @@ read_compressed_number (struct sfm_reader *r, double *d)
            break;
     
          case 254:
     -      sys_error (r, _("Compressed data is corrupt."));
     +      float_convert (r->float_format, "        ", FLOAT_NATIVE_DOUBLE, d);
     +      if (!r->corruption_warning)
     +        {
     +          r->corruption_warning = true;
     +          sys_warn (r, _("Possible compressed data corruption: "
     +                         "compressed spaces appear in numeric field."));
     +        }
     +      break;
     
          case 255:
            *d = SYSMIS;
     @@ -1395,7 +1404,8 @@ read_compressed_number (struct sfm_reader *r, double *d)
      static bool
      read_compressed_string (struct sfm_reader *r, char *dst)
      {
     -  switch (read_opcode (r))
     +  int opcode = read_opcode (r);
     +  switch (opcode)
          {
          case -1:
          case 252:
     @@ -1410,7 +1420,25 @@ read_compressed_string (struct sfm_reader *r, char *dst)
            break;
     
          default:
     -      sys_error (r, _("Compressed data is corrupt."));
     +      {
     +        double value = opcode - r->bias;
     +        float_convert (FLOAT_NATIVE_DOUBLE, &value, r->float_format, dst);
     +        if (value == 0.0)
     +          {
     +            /* This has actually been seen "in the wild".  The submitter of the
     +               file showed that the contents decoded as spaces, but they
     +               were at the end of the field so it's possible that the null
     +               bytes just acted as null terminators. */
     +          }
     +        else if (!r->corruption_warning)
     +          {
     +            r->corruption_warning = true;
     +            sys_warn (r, _("Possible compressed data corruption: "
     +                           "string contains compressed integer (opcode %d)"),
     +                      opcode);
     +          }
     +      }
     +      break;
          }
     
        return true;
     
     --
     Peter Seebach on managing engineers:
     "It's like herding cats, only most of the engineers are already
      sick of laser pointers."
     
     
     _______________________________________________
     pspp-dev mailing list
     pspp-dev@...
     http://lists.gnu.org/mailman/listinfo/pspp-dev

--
PGP Public key ID: 1024D/2DE827B3
fingerprint = 8797 A26D 0854 2EAB 0285  A290 8A67 719C 2DE8 27B3
See http://pgp.mit.edu or any PGP keyserver for public key.




_______________________________________________
pspp-dev mailing list
pspp-dev@...
http://lists.gnu.org/mailman/listinfo/pspp-dev

signature.asc (196 bytes) Download Attachment

Re: fix for reading funny compressed data, for review

by Ben Pfaff :: Rate this Message:

Reply to Author | View Threaded | Show Only this Message

Thanks.  I pushed it out.

John Darrington <john@...> writes:

> It seems reasonable to me.
>
> J'
>
> On Wed, Oct 14, 2009 at 09:44:52PM -0700, Ben Pfaff wrote:
>      I'd like to push this to the stable branch.  Comments
>      appreciated.
>      
>      commit e624e2da6ea68d22e6d4fba4eaa96d37d07a6730
>      Author: Ben Pfaff <blp@...>
>      Date:   Wed Oct 14 21:20:44 2009 -0700
>      
>          sys-file-reader: Tolerate nonsensical opcodes in compressed data.
>          
>          Compressed data in .sav files uses a set of 256 opcodes, some of which make
>          sense only for numeric data and others of which only make sense for string
>          data.  However, Jereme Thomas <jereme.thomas@...> has provided one
>          file, written by SPSS 14, that uses an opcode that seems to make sense
>          only for numeric data in a string field.  So this commit adds support for
>          these opcodes, although it still warns about the ones other than the exact
>          one found in the file provided by Jereme.
>      
>      diff --git a/doc/dev/system-file-format.texi b/doc/dev/system-file-format.texi
>      index 70fa385..b1be385 100644
>      --- a/doc/dev/system-file-format.texi
>      +++ b/doc/dev/system-file-format.texi
>      @@ -884,6 +884,9 @@ value @var{code} - @var{bias}, where
>       variable @code{bias} from the file header.  For example,
>       code 105 with bias 100.0 (the normal value) indicates a numeric variable
>       of value 5.
>      +One file has been seen written by SPSS 14 that contained such a code
>      +in a @emph{string} field with the value 0 (after the bias is
>      +subtracted) as a way of encoding null bytes.
>      
>       @item 252
>       End of file.  This code may or may not appear at the end of the data
>      diff --git a/src/data/sys-file-reader.c b/src/data/sys-file-reader.c
>      index fe7b533..8d973e4 100644
>      --- a/src/data/sys-file-reader.c
>      +++ b/src/data/sys-file-reader.c
>      @@ -86,6 +86,7 @@ struct sfm_reader
>           double bias; /* Compression bias, usually 100.0. */
>           uint8_t opcodes[8];         /* Current block of opcodes. */
>           size_t opcode_idx;          /* Next opcode to interpret, 8 if none left. */
>      +    bool corruption_warning;    /* Warned about possible corruption? */
>         };
>      
>       static const struct casereader_class sys_file_casereader_class;
>      @@ -192,6 +193,7 @@ sfm_open_reader (struct file_handle *fh, struct dictionary **dict,
>         r->oct_cnt = 0;
>         r->has_long_var_names = false;
>         r->opcode_idx = sizeof r->opcodes;
>      +  r->corruption_warning = false;
>      
>         /* TRANSLATORS: this fragment will be interpolated into
>            messages in fh_lock() that identify types of files. */
>      @@ -1374,7 +1376,14 @@ read_compressed_number (struct sfm_reader *r, double *d)
>             break;
>      
>           case 254:
>      -      sys_error (r, _("Compressed data is corrupt."));
>      +      float_convert (r->float_format, "        ", FLOAT_NATIVE_DOUBLE, d);
>      +      if (!r->corruption_warning)
>      +        {
>      +          r->corruption_warning = true;
>      +          sys_warn (r, _("Possible compressed data corruption: "
>      +                         "compressed spaces appear in numeric field."));
>      +        }
>      +      break;
>      
>           case 255:
>             *d = SYSMIS;
>      @@ -1395,7 +1404,8 @@ read_compressed_number (struct sfm_reader *r, double *d)
>       static bool
>       read_compressed_string (struct sfm_reader *r, char *dst)
>       {
>      -  switch (read_opcode (r))
>      +  int opcode = read_opcode (r);
>      +  switch (opcode)
>           {
>           case -1:
>           case 252:
>      @@ -1410,7 +1420,25 @@ read_compressed_string (struct sfm_reader *r, char *dst)
>             break;
>      
>           default:
>      -      sys_error (r, _("Compressed data is corrupt."));
>      +      {
>      +        double value = opcode - r->bias;
>      +        float_convert (FLOAT_NATIVE_DOUBLE, &value, r->float_format, dst);
>      +        if (value == 0.0)
>      +          {
>      +            /* This has actually been seen "in the wild".  The submitter of the
>      +               file showed that the contents decoded as spaces, but they
>      +               were at the end of the field so it's possible that the null
>      +               bytes just acted as null terminators. */
>      +          }
>      +        else if (!r->corruption_warning)
>      +          {
>      +            r->corruption_warning = true;
>      +            sys_warn (r, _("Possible compressed data corruption: "
>      +                           "string contains compressed integer (opcode %d)"),
>      +                      opcode);
>      +          }
>      +      }
>      +      break;
>           }
>      
>         return true;
>      
>      --
>      Peter Seebach on managing engineers:
>      "It's like herding cats, only most of the engineers are already
>       sick of laser pointers."
>      
>      
>      _______________________________________________
>      pspp-dev mailing list
>      pspp-dev@...
>      http://lists.gnu.org/mailman/listinfo/pspp-dev

--
Peter Seebach on managing engineers:
"It's like herding cats, only most of the engineers are already
 sick of laser pointers."


_______________________________________________
pspp-dev mailing list
pspp-dev@...
http://lists.gnu.org/mailman/listinfo/pspp-dev