mirror of
https://git.savannah.gnu.org/git/coreutils.git
synced 2025-09-10 07:59:52 +02:00
fold: add the --characters option
* src/fold.c: Include mcel.h. (count_bytes): Remove variable. (counting_mode, last_character_width): New variables. (shortopts, long_options): Add the option. (adjust_column): If --characters is in used account for number of characters instead of their width. (fold_file): Use getline and iterate over the result with mcel functions to handle multibyte characters. (main): Check for the option. * src/local.mk (src_fold_LDADD): Add $(LIBC32CONV), $(LIBUNISTRING), and $(MBRTOWC_LIB). * tests/fold/fold-characters.sh: New file. * tests/fold/fold-spaces.sh: New file. * tests/fold/fold-nbsp.sh: New file. * tests/local.mk (all_tests): Add the tests. * NEWS: Mention the new option. * doc/coreutils.texi (fold invocation): Likewise.
This commit is contained in:
3
NEWS
3
NEWS
@@ -83,6 +83,9 @@ GNU coreutils NEWS -*- outline -*-
|
||||
Iranian locale (fa_IR) and for the Ethiopian locale (am_ET), and also
|
||||
does so more consistently for the Thailand locale (th_TH.UTF-8).
|
||||
|
||||
fold now supports the --characters (-c) option to count multi-byte
|
||||
characters instead of the number of columns.
|
||||
|
||||
nproc now honors any cgroup v2 configured CPU quotas,
|
||||
which may reduce the effective number of processors available.
|
||||
|
||||
|
||||
@@ -2964,6 +2964,13 @@ Count bytes rather than columns, so that tabs, backspaces, and carriage
|
||||
returns are each counted as taking up one column, just like other
|
||||
characters.
|
||||
|
||||
@item -c
|
||||
@itemx --characters
|
||||
@opindex -c
|
||||
@opindex --characters
|
||||
Count characters rather than columns, meaning that lines containing
|
||||
characters wider than one column will be visually longer.
|
||||
|
||||
@item -s
|
||||
@itemx --spaces
|
||||
@opindex -s
|
||||
|
||||
175
src/fold.c
175
src/fold.c
@@ -25,6 +25,7 @@
|
||||
|
||||
#include "system.h"
|
||||
#include "fadvise.h"
|
||||
#include "mcel.h"
|
||||
#include "xdectoint.h"
|
||||
|
||||
#define TAB_WIDTH 8
|
||||
@@ -37,17 +38,26 @@
|
||||
/* If nonzero, try to break on whitespace. */
|
||||
static bool break_spaces;
|
||||
|
||||
/* If nonzero, count bytes, not column positions. */
|
||||
static bool count_bytes;
|
||||
/* Mode to operate in. */
|
||||
static enum
|
||||
{
|
||||
COUNT_COLUMNS,
|
||||
COUNT_BYTES,
|
||||
COUNT_CHARACTERS
|
||||
} counting_mode = COUNT_COLUMNS;
|
||||
|
||||
/* If nonzero, at least one of the files we read was standard input. */
|
||||
static bool have_read_stdin;
|
||||
|
||||
static char const shortopts[] = "bsw:0::1::2::3::4::5::6::7::8::9::";
|
||||
/* Width of last read character. */
|
||||
static int last_character_width = 0;
|
||||
|
||||
static char const shortopts[] = "bcsw:0::1::2::3::4::5::6::7::8::9::";
|
||||
|
||||
static struct option const longopts[] =
|
||||
{
|
||||
{"bytes", no_argument, nullptr, 'b'},
|
||||
{"characters", no_argument, nullptr, 'c'},
|
||||
{"spaces", no_argument, nullptr, 's'},
|
||||
{"width", required_argument, nullptr, 'w'},
|
||||
{GETOPT_HELP_OPTION_DECL},
|
||||
@@ -75,6 +85,7 @@ Wrap input lines in each FILE, writing to standard output.\n\
|
||||
|
||||
fputs (_("\
|
||||
-b, --bytes count bytes rather than columns\n\
|
||||
-c, --characters count characters rather than columns\n\
|
||||
-s, --spaces break at spaces\n\
|
||||
-w, --width=WIDTH use WIDTH columns instead of 80\n\
|
||||
"), stdout);
|
||||
@@ -90,24 +101,28 @@ Wrap input lines in each FILE, writing to standard output.\n\
|
||||
The first column is 0. */
|
||||
|
||||
static size_t
|
||||
adjust_column (size_t column, char c)
|
||||
adjust_column (size_t column, mcel_t g)
|
||||
{
|
||||
if (!count_bytes)
|
||||
if (counting_mode != COUNT_BYTES)
|
||||
{
|
||||
if (c == '\b')
|
||||
if (g.ch == '\b')
|
||||
{
|
||||
if (column > 0)
|
||||
column--;
|
||||
column -= last_character_width;
|
||||
}
|
||||
else if (c == '\r')
|
||||
else if (g.ch == '\r')
|
||||
column = 0;
|
||||
else if (c == '\t')
|
||||
else if (g.ch == '\t')
|
||||
column += TAB_WIDTH - column % TAB_WIDTH;
|
||||
else /* if (isprint (c)) */
|
||||
column++;
|
||||
else /* if (c32isprint (g.ch)) */
|
||||
{
|
||||
last_character_width = (counting_mode == COUNT_CHARACTERS
|
||||
? 1 : c32width (g.ch));
|
||||
column += last_character_width;
|
||||
}
|
||||
}
|
||||
else
|
||||
column++;
|
||||
column += g.len;
|
||||
return column;
|
||||
}
|
||||
|
||||
@@ -119,11 +134,13 @@ static bool
|
||||
fold_file (char const *filename, size_t width)
|
||||
{
|
||||
FILE *istream;
|
||||
int c;
|
||||
size_t column = 0; /* Screen column where next char will go. */
|
||||
idx_t offset_out = 0; /* Index in 'line_out' for next char. */
|
||||
static char *line_out = nullptr;
|
||||
static idx_t allocated_out = 0;
|
||||
static char *line_in = nullptr;
|
||||
static size_t allocated_in = 0;
|
||||
static ssize_t length_in = 0;
|
||||
int saved_errno;
|
||||
|
||||
if (STREQ (filename, "-"))
|
||||
@@ -142,74 +159,90 @@ fold_file (char const *filename, size_t width)
|
||||
|
||||
fadvise (istream, FADVISE_SEQUENTIAL);
|
||||
|
||||
while ((c = getc (istream)) != EOF)
|
||||
while (0 <= (length_in = getline (&line_in, &allocated_in, istream)))
|
||||
{
|
||||
if (allocated_out - offset_out <= 1)
|
||||
line_out = xpalloc (line_out, &allocated_out, 1, -1, sizeof *line_out);
|
||||
|
||||
if (c == '\n')
|
||||
char *p = line_in;
|
||||
char *lim = p + length_in;
|
||||
mcel_t g;
|
||||
for (; p < lim; p += g.len)
|
||||
{
|
||||
line_out[offset_out++] = c;
|
||||
fwrite (line_out, sizeof (char), offset_out, stdout);
|
||||
column = offset_out = 0;
|
||||
continue;
|
||||
}
|
||||
|
||||
rescan:
|
||||
column = adjust_column (column, c);
|
||||
|
||||
if (column > width)
|
||||
{
|
||||
/* This character would make the line too long.
|
||||
Print the line plus a newline, and make this character
|
||||
start the next line. */
|
||||
if (break_spaces)
|
||||
g = mcel_scan (p, lim);
|
||||
if (allocated_out - offset_out <= g.len)
|
||||
line_out = xpalloc (line_out, &allocated_out, g.len, -1,
|
||||
sizeof *line_out);
|
||||
if (g.ch == '\n')
|
||||
{
|
||||
bool found_blank = false;
|
||||
idx_t logical_end = offset_out;
|
||||
memcpy (line_out + offset_out, p, g.len);
|
||||
offset_out += g.len;
|
||||
fwrite (line_out, sizeof (char), offset_out, stdout);
|
||||
column = offset_out = 0;
|
||||
continue;
|
||||
}
|
||||
rescan:
|
||||
column = adjust_column (column, g);
|
||||
|
||||
/* Look for the last blank. */
|
||||
while (logical_end)
|
||||
if (column > width)
|
||||
{
|
||||
/* This character would make the line too long.
|
||||
Print the line plus a newline, and make this character
|
||||
start the next line. */
|
||||
if (break_spaces)
|
||||
{
|
||||
--logical_end;
|
||||
if (isblank (to_uchar (line_out[logical_end])))
|
||||
int space_length = 0;
|
||||
idx_t logical_end = offset_out;
|
||||
char *logical_p = line_out;
|
||||
char *logical_lim = logical_p + logical_end;
|
||||
|
||||
for (mcel_t g2; logical_p < logical_lim; logical_p += g2.len)
|
||||
{
|
||||
found_blank = true;
|
||||
break;
|
||||
g2 = mcel_scan (logical_p, logical_lim);
|
||||
if (c32isblank (g2.ch))
|
||||
{
|
||||
space_length = g2.len;
|
||||
logical_end = logical_p - line_out;
|
||||
}
|
||||
}
|
||||
|
||||
if (space_length)
|
||||
{
|
||||
logical_end += space_length;
|
||||
/* Found a blank. Don't output the part after it. */
|
||||
fwrite (line_out, sizeof (char), logical_end, stdout);
|
||||
putchar ('\n');
|
||||
/* Move the remainder to the beginning of the next line.
|
||||
The areas being copied here might overlap. */
|
||||
memmove (line_out, line_out + logical_end,
|
||||
offset_out - logical_end);
|
||||
offset_out -= logical_end;
|
||||
column = 0;
|
||||
char *printed_p = line_out;
|
||||
char *printed_lim = printed_p + offset_out;
|
||||
for (mcel_t g2; printed_p < printed_lim;
|
||||
printed_p += g2.len)
|
||||
{
|
||||
g2 = mcel_scan (printed_p, printed_lim);
|
||||
column = adjust_column (column, g2);
|
||||
}
|
||||
goto rescan;
|
||||
}
|
||||
}
|
||||
|
||||
if (found_blank)
|
||||
if (offset_out == 0)
|
||||
{
|
||||
/* Found a blank. Don't output the part after it. */
|
||||
logical_end++;
|
||||
fwrite (line_out, sizeof (char), logical_end, stdout);
|
||||
putchar ('\n');
|
||||
/* Move the remainder to the beginning of the next line.
|
||||
The areas being copied here might overlap. */
|
||||
memmove (line_out, line_out + logical_end,
|
||||
offset_out - logical_end);
|
||||
offset_out -= logical_end;
|
||||
column = 0;
|
||||
for (idx_t i = 0; i < offset_out; i++)
|
||||
column = adjust_column (column, line_out[i]);
|
||||
goto rescan;
|
||||
memcpy (line_out + offset_out, p, g.len);
|
||||
offset_out += g.len;
|
||||
continue;
|
||||
}
|
||||
|
||||
line_out[offset_out++] = '\n';
|
||||
fwrite (line_out, sizeof (char), offset_out, stdout);
|
||||
column = offset_out = 0;
|
||||
goto rescan;
|
||||
}
|
||||
|
||||
if (offset_out == 0)
|
||||
{
|
||||
line_out[offset_out++] = c;
|
||||
continue;
|
||||
}
|
||||
|
||||
line_out[offset_out++] = '\n';
|
||||
fwrite (line_out, sizeof (char), offset_out, stdout);
|
||||
column = offset_out = 0;
|
||||
goto rescan;
|
||||
memcpy (line_out + offset_out, p, g.len);
|
||||
offset_out += g.len;
|
||||
}
|
||||
|
||||
line_out[offset_out++] = c;
|
||||
}
|
||||
|
||||
saved_errno = errno;
|
||||
@@ -249,7 +282,7 @@ main (int argc, char **argv)
|
||||
|
||||
atexit (close_stdout);
|
||||
|
||||
break_spaces = count_bytes = have_read_stdin = false;
|
||||
break_spaces = have_read_stdin = false;
|
||||
|
||||
while ((optc = getopt_long (argc, argv, shortopts, longopts, nullptr)) != -1)
|
||||
{
|
||||
@@ -258,7 +291,11 @@ main (int argc, char **argv)
|
||||
switch (optc)
|
||||
{
|
||||
case 'b': /* Count bytes rather than columns. */
|
||||
count_bytes = true;
|
||||
counting_mode = COUNT_BYTES;
|
||||
break;
|
||||
|
||||
case 'c': /* Count characters rather than columns. */
|
||||
counting_mode = COUNT_CHARACTERS;
|
||||
break;
|
||||
|
||||
case 's': /* Break at word boundaries. */
|
||||
|
||||
@@ -336,6 +336,9 @@ src_sort_LDADD += $(LIBPMULTITHREAD)
|
||||
# for pthread_sigmask
|
||||
src_sort_LDADD += $(PTHREAD_SIGMASK_LIB)
|
||||
|
||||
# for mbrtowc, mbfile
|
||||
src_fold_LDADD += $(LIBC32CONV) $(LIBUNISTRING) $(MBRTOWC_LIB)
|
||||
|
||||
# Get the release year from lib/version-etc.c.
|
||||
RELEASE_YEAR = \
|
||||
`sed -n '/.*COPYRIGHT_YEAR = \([0-9][0-9][0-9][0-9]\) };/s//\1/p' \
|
||||
|
||||
61
tests/fold/fold-characters.sh
Executable file
61
tests/fold/fold-characters.sh
Executable file
@@ -0,0 +1,61 @@
|
||||
#!/bin/sh
|
||||
# Test fold --characters.
|
||||
|
||||
# Copyright (C) 2025 Free Software Foundation, Inc.
|
||||
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
|
||||
print_ver_ fold printf
|
||||
|
||||
test "$LOCALE_FR_UTF8" != none || skip_ "French UTF-8 locale not available"
|
||||
|
||||
LC_ALL=$LOCALE_FR_UTF8
|
||||
export LC_ALL
|
||||
|
||||
# The string "뉐뉐뉐" is 3 characters, but occupies 6 columns.
|
||||
env printf '\uB250\uB250\uB250\n' > input1 || framework_failure_
|
||||
env printf '\uB250\uB250\n\uB250\n' > column-exp1 || framework_failure_
|
||||
|
||||
fold -w 5 input1 > column-out1 || fail=1
|
||||
compare column-exp1 column-out1 || fail=1
|
||||
|
||||
# Should be the same as the input.
|
||||
fold --characters -w 5 input1 > characters-out1 || fail=1
|
||||
compare input1 characters-out1 || fail=1
|
||||
|
||||
# Test with 50 2 column wide characters.
|
||||
for i in $(seq 50); do
|
||||
env printf '\uFF1A' >> input2 || framework_failure_
|
||||
env printf '\uFF1A' >> column-exp2 || framework_failure_
|
||||
env printf '\uFF1A' >> character-exp2 || framework_failure_
|
||||
if test $(($i % 5)) -eq 0; then
|
||||
env printf '\n' >> column-exp2 || framework_failure_
|
||||
fi
|
||||
if test $(($i % 10)) -eq 0; then
|
||||
env printf '\n' >> character-exp2 || framework_failure_
|
||||
fi
|
||||
done
|
||||
|
||||
env printf '\n' >> input2 || framework_failure_
|
||||
|
||||
# 5 characters per line.
|
||||
fold -w 10 input2 > column-out2 || fail=1
|
||||
compare column-exp2 column-out2 || fail=1
|
||||
|
||||
# 10 characters per line.
|
||||
fold --characters -w 10 input2 > character-out2 || fail=1
|
||||
compare character-exp2 character-out2 || fail=1
|
||||
|
||||
Exit $fail
|
||||
41
tests/fold/fold-nbsp.sh
Executable file
41
tests/fold/fold-nbsp.sh
Executable file
@@ -0,0 +1,41 @@
|
||||
#!/bin/sh
|
||||
# Test fold --spaces with various Unicode non-breaking space characters.
|
||||
|
||||
# Copyright (C) 2025 Free Software Foundation, Inc.
|
||||
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
|
||||
print_ver_ fold printf
|
||||
|
||||
test "$LOCALE_FR_UTF8" != none || skip_ "French UTF-8 locale not available"
|
||||
|
||||
LC_ALL=$LOCALE_FR_UTF8
|
||||
export LC_ALL
|
||||
|
||||
# One U+2007 FIGURE SPACE characters.
|
||||
env printf 'abcdefghijklmnop\u2007qrstuvwxyz\n' > input1 || framework_failure_
|
||||
env printf 'abcdefghij\nklmnop\u2007qrs\ntuvwxyz\n'> exp1 || framework_failure_
|
||||
fold --spaces --width 10 input1 > out1 || fail=1
|
||||
compare exp1 out1 || fail=1
|
||||
|
||||
# Two U+00A0 NO-BREAK SPACE characters.
|
||||
env printf 'abcdefghijklmnop\u00A0\u00A0qrstuvwxyz\n' > input2 \
|
||||
|| framework_failure_
|
||||
env printf 'abcdefghij\nklmnop\u00A0\u00A0qr\nstuvwxyz\n'> exp2 \
|
||||
|| framework_failure_
|
||||
fold --spaces --width 10 input2 > out2 || fail=1
|
||||
compare exp2 out2 || fail=1
|
||||
|
||||
Exit $fail
|
||||
41
tests/fold/fold-spaces.sh
Executable file
41
tests/fold/fold-spaces.sh
Executable file
@@ -0,0 +1,41 @@
|
||||
#!/bin/sh
|
||||
# Test fold --spaces with various Unicode breaking space characters.
|
||||
|
||||
# Copyright (C) 2025 Free Software Foundation, Inc.
|
||||
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
|
||||
print_ver_ fold printf
|
||||
|
||||
test "$LOCALE_FR_UTF8" != none || skip_ "French UTF-8 locale not available"
|
||||
|
||||
LC_ALL=$LOCALE_FR_UTF8
|
||||
export LC_ALL
|
||||
|
||||
# One U+2002 EN SPACE characters.
|
||||
env printf 'abcdefghijklmnop\u2002qrstuvwxyz\n' > input1 || framework_failure_
|
||||
env printf 'abcdefghij\nklmnop\u2002\nqrstuvwxyz\n'> exp1 || framework_failure_
|
||||
fold --spaces --width 10 input1 > out1 || fail=1
|
||||
compare exp1 out1 || fail=1
|
||||
|
||||
# Two U+2003 EM SPACE characters.
|
||||
env printf 'abcdefghijklmnop\u2003\u2003qrstuvwxyz\n' > input2 \
|
||||
|| framework_failure_
|
||||
env printf 'abcdefghij\nklmnop\u2003\u2003\nqrstuvwxyz\n'> exp2 \
|
||||
|| framework_failure_
|
||||
fold --spaces --width 10 input2 > out2 || fail=1
|
||||
compare exp2 out2 || fail=1
|
||||
|
||||
Exit $fail
|
||||
@@ -326,6 +326,9 @@ all_tests = \
|
||||
tests/factor/factor.pl \
|
||||
tests/factor/factor-parallel.sh \
|
||||
tests/misc/false-status.sh \
|
||||
tests/fold/fold-characters.sh \
|
||||
tests/fold/fold-nbsp.sh \
|
||||
tests/fold/fold-spaces.sh \
|
||||
tests/misc/fold.pl \
|
||||
tests/groups/groups-dash.sh \
|
||||
tests/groups/groups-process-all.sh \
|
||||
|
||||
Reference in New Issue
Block a user