|
View:
New views
2 Messages
—
Rating Filter:
Alert me
|
|
|
join suggestion: auto-output-format
Hello,
I'd like to suggest another small feature for join (not related to the '--header' feature I previously sent).

This feature allows join to automatically guess the output format without specifying '-o', allowing easier use (IMHO) of "-e". This is mostly a convenience, DWIM kind of feature.

Here is a simple use case:

$ cat 1.txt
1 alice
2 bob
4 dave

$ cat 2.txt
1 red
2 green
3 blue

Joining with "-a 1 -a 2" will display the third and fourth items without proper field 'fillers':

$ join -j1 -a1 -a2 1.txt 2.txt
1 alice red
2 bob green
3 blue
4 dave

This behavior is of course by design. If one needs the empty columns to be filled, it requires both "-e" and "-o", and to use "-o" properly, one needs to know beforehand the columns in the input files:

$ join -j1 -a1 -a2 -e FOO -o 0,1.2,2.2 1.txt 2.txt
1 alice red
2 bob green
3 FOO blue
4 dave FOO

If there are many columns in the input fields, writing the proper "-o" format string is cumbersome.

I suggest a simple feature: When adding "--auto-format" argument, join will automatically generate an output format (simulating "-o"), by putting the joined field first, followed by all the fields from file1, followed by all fields from file2. (This feature assumes the number of columns in the first lines represents the number of columns in all lines).

This allows using "-e" without specifying "-o", as so:

$ join -j1 -a1 -a2 -e FOO --auto-format 1.txt 2.txt
1 alice red
2 bob green
3 FOO blue
4 dave FOO

Attached is a first draft of this feature (also available here: http://cancan.cshl.edu/labmembers/gordon/coreutils8/join_auto_format.patch ).

Comments are welcome. Please tell me if you're willing to consider adding this feature to coreutils.
Thanks, gordon src/join.c | 36 +++++++++++++++++++++++++++++++++++- 1 files changed, 35 insertions(+), 1 deletions(-) diff --git a/src/join.c b/src/join.c index d734a91..71219f9 100644 --- a/src/join.c +++ b/src/join.c @@ -146,6 +146,7 @@ static struct option const longopts[] = {"ignore-case", no_argument, NULL, 'i'}, {"check-order", no_argument, NULL, CHECK_ORDER_OPTION}, {"nocheck-order", no_argument, NULL, NOCHECK_ORDER_OPTION}, + {"auto-format", no_argument, NULL, 'F'}, {GETOPT_HELP_OPTION_DECL}, {GETOPT_VERSION_OPTION_DECL}, {NULL, 0, NULL, 0} @@ -157,6 +158,12 @@ static struct line uni_blank; /* If nonzero, ignore case when comparing join fields. */ static bool ignore_case; +/* if nonzero, automatically build a specific output field list, + based on the first line of each input file */ +static bool auto_output_format; + +static void build_output_format(const struct line const *line1, const struct line const* line2); + void usage (int status) { @@ -191,6 +198,8 @@ by whitespace. When FILE1 or FILE2 (not both) is -, read standard input.\n\ --check-order check that the input is correctly sorted, even\n\ if all input lines are pairable\n\ --nocheck-order do not check that the input is correctly sorted\n\ + -F, --auto-format Automatically build output format, based on the first\n\ + line of each input file. 
Allows '-e' without using '-o'.\n\ "), stdout); fputs (HELP_OPTION_DESCRIPTION, stdout); fputs (VERSION_OPTION_DESCRIPTION, stdout); @@ -616,6 +625,9 @@ join (FILE *fp1, FILE *fp2) initseq (&seq2); getseq (fp2, &seq2, 2); + if (auto_output_format && seq1.count && seq2.count) + build_output_format(seq1.lines[0],seq2.lines[0]); + while (seq1.count && seq2.count) { size_t i; @@ -926,6 +938,24 @@ add_file_name (char *name, char *names[2], *optc_status = MIGHT_BE_O_ARG; } +static void +build_output_format(const struct line const *line1, const struct line const* line2) +{ + int i ; + if (outlist_head.next) + return; + + add_field(0,0); + for (i = 0; i < join_field_1 && i < line1->nfields; ++i) + add_field(1,i); + for (i = join_field_1 + 1; i < line1->nfields; ++i) + add_field(1,i); + for (i = 0; i < join_field_2 && i < line2->nfields; ++i) + add_field(2,i); + for (i = join_field_2 + 1; i < line2->nfields; ++i) + add_field(2,i); +} + int main (int argc, char **argv) { @@ -954,7 +984,7 @@ main (int argc, char **argv) issued_disorder_warning[0] = issued_disorder_warning[1] = false; check_input_order = CHECK_ORDER_DEFAULT; - while ((optc = getopt_long (argc, argv, "-a:e:i1:2:j:o:t:v:", + while ((optc = getopt_long (argc, argv, "-a:e:i1:2:j:o:t:v:F", longopts, NULL)) != -1) { @@ -1052,6 +1082,10 @@ main (int argc, char **argv) &nfiles, &prev_optc_status, &optc_status); break; + case 'F': + auto_output_format = true; + break; + case_GETOPT_HELP_CHAR; case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS); |
|
|
Re: join suggestion: auto-output-format
Hello,
Here's an improved version of the 'auto-output-format' feature for join. Includes code, tests, NEWS and documentation.

The patch is attached below, and is also available here: http://cancan.cshl.edu/labmembers/gordon/coreutils8/join_auto_format.patch

See email below for a use-case example.

Thanks,
-gordon

Assaf Gordon wrote, On 11/04/2009 08:36 PM:
>
> This feature allows join to automatically guess the output format
> without specifying '-o', allowing easier use (IMHO) of "-e". This is
> mostly a convenience, DWIM kind of feature.
> Here a simple use case:
>
> $ cat 1.txt
> 1 alice
> 2 bob
> 4 dave
>
> $ cat 2.txt
> 1 red
> 2 green
> 3 blue
>
> Joining with "-a 1 -a 2" will display the third and fourth items without
> proper field 'fillers':
>
> $ join -j1 -a1 -a2 1.txt 2.txt
> 1 alice red
> 2 bob green
> 3 blue
> 4 dave
>
> This behavior is of course by design.
> If one needs the empty columns to be filled, it requires both "-e" and
> "-o", and to use "-o" properly, one needs to know beforehand the columns
> in the input files:
>
> $ join -j1 -a1 -a2 -e FOO -o 0,1.2,2.2 1.txt 2.txt
> 1 alice red
> 2 bob green
> 3 FOO blue
> 4 dave FOO
>
> If there are many columns in the input fields, writing the proper "-o"
> format string is cumbersome.
>
> I suggest a simple feature:
> When adding "--auto-format" argument, join will automatically generate
> an output format (simulating "-o"), by putting the joined field first,
> followed by all the fields from file1, followed by all fields from file2.
> (This feature assumes the number of columns in the first lines
> represents the number of columns in all lines).
> This allows using "-e" without specifying "-o", as so: > > $ join -j1 -a1 -a2 -e FOO --auto-format 1.txt 2.txt > 1 alice red > 2 bob green > 3 FOO blue > 4 dave FOO > NEWS | 3 +++ doc/coreutils.texi | 10 ++++++++++ src/join.c | 36 +++++++++++++++++++++++++++++++++++- tests/misc/join | 21 +++++++++++++++++++++ 4 files changed, 69 insertions(+), 1 deletions(-) diff --git a/NEWS b/NEWS index 5b75dbb..8655faa 100644 --- a/NEWS +++ b/NEWS @@ -77,6 +77,9 @@ GNU coreutils NEWS -*- outline -*- touch now accepts the option --no-dereference (-h), as a means to change symlink timestamps on platforms with enough support. + join now accepts the option --auto-format (-F), to automatically + detect the output format without requireing explicit -o. + * Noteworthy changes in release 8.0 (2009-10-06) [beta] diff --git a/doc/coreutils.texi b/doc/coreutils.texi index 227014c..f692f47 100644 --- a/doc/coreutils.texi +++ b/doc/coreutils.texi @@ -5512,6 +5512,16 @@ Do not check that both input files are in sorted order. This is the default. Replace those output fields that are missing in the input with @var{string}. +@item -F +@itemx --auto-format +@opindex -i +@opindex --auto-format +Automatically detects output format based on the number of fields in the +first line of each input file (as if the user explicitly specified @samp{-o}). +Allows using @samp{-e} without a-priori knowledge of the fields in the input files. +The join field is printed first, followed by the remaining fields from the first +file and the second file. 
+ @item -i @itemx --ignore-case @opindex -i diff --git a/src/join.c b/src/join.c index d734a91..07112eb 100644 --- a/src/join.c +++ b/src/join.c @@ -146,6 +146,7 @@ static struct option const longopts[] = {"ignore-case", no_argument, NULL, 'i'}, {"check-order", no_argument, NULL, CHECK_ORDER_OPTION}, {"nocheck-order", no_argument, NULL, NOCHECK_ORDER_OPTION}, + {"auto-format", no_argument, NULL, 'F'}, {GETOPT_HELP_OPTION_DECL}, {GETOPT_VERSION_OPTION_DECL}, {NULL, 0, NULL, 0} @@ -157,6 +158,12 @@ static struct line uni_blank; /* If nonzero, ignore case when comparing join fields. */ static bool ignore_case; +/* if nonzero, automatically build a specific output field list, + based on the first line of each input file */ +static bool auto_output_format; + +static void build_output_format(struct line const *line1, struct line const* line2); + void usage (int status) { @@ -191,6 +198,8 @@ by whitespace. When FILE1 or FILE2 (not both) is -, read standard input.\n\ --check-order check that the input is correctly sorted, even\n\ if all input lines are pairable\n\ --nocheck-order do not check that the input is correctly sorted\n\ + -F, --auto-format Automatically build output format, based on the first\n\ + line of each input file. 
Allows '-e' without using '-o'.\n\ "), stdout); fputs (HELP_OPTION_DESCRIPTION, stdout); fputs (VERSION_OPTION_DESCRIPTION, stdout); @@ -616,6 +625,9 @@ join (FILE *fp1, FILE *fp2) initseq (&seq2); getseq (fp2, &seq2, 2); + if (auto_output_format && seq1.count && seq2.count) + build_output_format(seq1.lines[0],seq2.lines[0]); + while (seq1.count && seq2.count) { size_t i; @@ -926,6 +938,24 @@ add_file_name (char *name, char *names[2], *optc_status = MIGHT_BE_O_ARG; } +static void +build_output_format(struct line const *line1, struct line const* line2) +{ + int i ; + if (outlist_head.next) + return; + + add_field(0,0); + for (i = 0; i < join_field_1 && i < line1->nfields; ++i) + add_field(1,i); + for (i = join_field_1 + 1; i < line1->nfields; ++i) + add_field(1,i); + for (i = 0; i < join_field_2 && i < line2->nfields; ++i) + add_field(2,i); + for (i = join_field_2 + 1; i < line2->nfields; ++i) + add_field(2,i); +} + int main (int argc, char **argv) { @@ -954,7 +984,7 @@ main (int argc, char **argv) issued_disorder_warning[0] = issued_disorder_warning[1] = false; check_input_order = CHECK_ORDER_DEFAULT; - while ((optc = getopt_long (argc, argv, "-a:e:i1:2:j:o:t:v:", + while ((optc = getopt_long (argc, argv, "-a:e:i1:2:j:o:t:v:F", longopts, NULL)) != -1) { @@ -1052,6 +1082,10 @@ main (int argc, char **argv) &nfiles, &prev_optc_status, &optc_status); break; + case 'F': + auto_output_format = true; + break; + case_GETOPT_HELP_CHAR; case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS); diff --git a/tests/misc/join b/tests/misc/join index d1f1677..3f1e590 100755 --- a/tests/misc/join +++ b/tests/misc/join @@ -185,6 +185,27 @@ my @tv = ( # Before 6.10.143, this would mistakenly fail with the diagnostic: # join: File 1 is not in sorted order ['chkodr-7', '-12', ["2 a\n1 b\n", ""], "", 0], + +# Auto-format +['autoformat-1', '-j1 -a1 -a2 -F -e FOO', + ["1 a\n2 b\n", "1 X\n3 Y\n"], "1 a X\n2 b FOO\n3 FOO Y\n", 0], + +# Auto-format, with empty filler (no '-e' specified)- +# should 
print a column delimiters (space characters), but no filler. +# This should be equivalent to specifing "-o 0,1.2,2.2" without "-e". +['autoformat-2', '-j1 -a1 -a2 -F', + ["1 a\n2 b\n", "1 X\n3 Y\n"], "1 a X\n2 b \n3 Y\n", 0], + +# auto-format sanity check: specify explicit output format without -e, +# make sure it matches the above test. +['autoformat-3', '-j1 -a1 -a2 -o 0,1.2,2.2', + ["1 a\n2 b\n", "1 X\n3 Y\n"], "1 a X\n2 b \n3 Y\n", 0], + +# both auto-format and explicit output format (different format than 'auto'), +# auto-format should be silently ignored. +['autoformat-4', '-j1 -a1 -a2 -e FOO -F -o 0,2.2,1.2', + ["1 a\n2 b\n", "1 X\n3 Y\n"], "1 X a\n2 FOO b\n3 Y FOO\n", 0], + ); # Convert the above old-style test vectors to the newer |
| Free embeddable forum powered by Nabble | Forum Help |