Skip to content

Commit baade3e

Browse files
author
dmh
committed
[NCF-293]
Allow .cdl files to have a leading utf-8 BOM. Also add test.
1 parent deeca5f commit baade3e

File tree

6 files changed

+124
-5
lines changed

6 files changed

+124
-5
lines changed

configure.ac

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -624,7 +624,7 @@ AC_HEADER_STDBOOL
624624
# Check for these functions...
625625
AC_CHECK_FUNCS([strlcat strerror snprintf strchr strrchr strcat strcpy \
626626
strdup strcasecmp strtod strtoll strtoull strstr \
627-
mkstemp rand \
627+
mkstemp rand memcmp \
628628
getrlimit gettimeofday fsync MPI_Comm_f2c])
629629

630630
# Does the user want to use NC_DISKLESS?

ncdump/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ TARGET_LINK_LIBRARIES(ncdump netcdf ${ALL_TLL_LIBS})
3737
TARGET_LINK_LIBRARIES(nccopy netcdf ${ALL_TLL_LIBS})
3838

3939
IF(ENABLE_TESTS)
40-
ADD_EXECUTABLE(rewrite-scalar rewrite-scalar.c)
40+
ADD_EXECUTABLE(rewrite-scalar rewrite-scalar.c)
# bom.c defines its own main(), so it cannot be an extra source of
# rewrite-scalar; build it as the stand-alone helper that tst_bom.sh runs.
ADD_EXECUTABLE(bom bom.c)
4141
TARGET_LINK_LIBRARIES(rewrite-scalar netcdf)
4242
# Base tests
4343
# The tests are set up as a combination of shell scripts and executables that
@@ -58,8 +58,8 @@ IF(ENABLE_TESTS)
5858
add_sh_test(ncdump tst_charfill)
5959
add_sh_test(ncdump tst_iter)
6060
add_sh_test(ncdump tst_formatx3)
61+
add_sh_test(ncdump tst_bom)
6162

62-
6363
IF(EXTRA_TESTS)
6464
add_sh_test(ncdump run_back_comp_tests)
6565
ENDIF()

ncdump/Makefile.am

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,10 @@ man_MANS = ncdump.1 nccopy.1
2828
if BUILD_TESTSETS
2929
#if !BUILD_DLL
3030
# These tests are run for both netCDF-4 and non-netCDF-4 builds.
31-
check_PROGRAMS = rewrite-scalar ctest ctest64 ncdump tst_utf8
31+
check_PROGRAMS = rewrite-scalar ctest ctest64 ncdump tst_utf8 bom
3232
TESTS = run_tests.sh tst_64bit.sh ctest ctest64 tst_output.sh \
3333
tst_lengths.sh tst_calendars.sh tst_utf8 run_utf8_tests.sh \
34-
tst_nccopy3.sh tst_charfill.sh tst_iter.sh tst_formatx3.sh
34+
tst_nccopy3.sh tst_charfill.sh tst_iter.sh tst_formatx3.sh tst_bom.sh
3535

3636
if LARGE_FILE_TESTS
3737
TESTS += tst_iter.sh

ncdump/bom.c

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
/*********************************************************************
2+
* Copyright 1993, UCAR/Unidata
3+
* See netcdf/COPYRIGHT file for copying and redistribution conditions.
4+
*********************************************************************/
5+
6+
#include <config.h>
7+
#include <stdlib.h>
8+
#include <stdio.h>
9+
10+
/* BOM Sequences (re: http://www.unicode.org/faq/utf_bom.html#BOM).
   Some of these contain NUL bytes, so they must always be used with an
   explicit byte count, never as NUL-terminated C strings. */
static char* U8 = "\xEF\xBB\xBF"; /* UTF-8 (3 bytes) */
static char* BE32 = "\x00\x00\xFE\xFF"; /* UTF-32; big-endian (4 bytes) */
static char* LE32 = "\xFF\xFE\x00\x00"; /* UTF-32; little-endian (4 bytes) */
static char* BE16 = "\xFE\xFF"; /* UTF-16; big-endian (2 bytes) */
static char* LE16 = "\xFF\xFE"; /* UTF-16; little-endian (2 bytes) */
16+
17+
int
18+
main(int argc, char** argv)
19+
{
20+
char* bom = U8;
21+
int bomlen = 3;
22+
if(argc > 1 && strlen(argv[1]) > 0) {
23+
char* which = argv[1];
24+
switch (which[0]) {
25+
case '1': bom = BE16; bomlen = 2; break;
26+
case '3': bom = BE32; bomlen = 2; break;
27+
default: break;
28+
}
29+
}
30+
fwrite(bom,1,bomlen,stdout);
31+
exit(0);
32+
}
33+

ncdump/tst_bom.sh

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
#!/bin/sh
# This shell script tests BOM support in ncgen:
#  - a .cdl file with a leading UTF-8 BOM must be accepted and must
#    round-trip through ncgen and ncdump;
#  - a .cdl file with a leading UTF-16 (big-endian) BOM must be rejected.
# The raw BOM bytes are produced by the ./bom helper program.

set -e

if test "x$srcdir" = "x"; then
  # Quote $0 so a path containing spaces does not break dirname.
  srcdir=`dirname "$0"`;
fi
# add hack for sunos
export srcdir;

echo ""

rm -f tst_bom.cdl tmp.cdl tst_bom8.* tst_bom16.*

# Reference CDL without a BOM. Truncate (>) rather than append (>>) so a
# stale copy can never end up duplicated inside the test input.
cat <<EOF >tst_bom.cdl
netcdf tst_bom {
variables:
	float f;
data:

 f = 1;
}
EOF

echo "*** Generate a cdl file with leading UTF-8 BOM."
./bom 8 >tst_bom8.cdl
cat tst_bom.cdl >> tst_bom8.cdl

echo "*** Verify .nc file"
../ncgen/ncgen -k1 -o tst_bom8.nc tst_bom8.cdl
../ncdump/ncdump -n tst_bom tst_bom8.nc > tmp.cdl
# -w: only whitespace may differ between the reference and the dump
diff -w tst_bom.cdl tmp.cdl

# Do it again but with Big-Endian 16; should fail

rm -f tmp.cdl tst_bom8.* tst_bom16.*

echo "*** Generate a cdl file with leading UTF-16 BOM."
./bom 16 >tst_bom16.cdl
cat tst_bom.cdl >> tst_bom16.cdl

echo "*** Verify UTF-16 file fails"
# ncgen runs as the 'if' condition, so its expected non-zero exit status
# does not trigger 'set -e'.
if ../ncgen/ncgen -k1 -o tst_bom16.nc tst_bom16.cdl ; then
  echo 'BOM Big Endian 16 succeeded, but should not'
  exit 1
else
  echo '***XFAIL: BOM Big Endian 16'
fi

# Cleanup
rm -f tst_bom.cdl tmp.cdl tst_bom8.* tst_bom16.*

exit 0

ncgen/main.c

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,13 @@ struct Languages legallanguages[] = {
124124
};
125125
#endif
126126

127+
/* BOM Sequences (re: http://www.unicode.org/faq/utf_bom.html#BOM).
   Some of these contain NUL bytes, so they must always be used with an
   explicit byte count, never as NUL-terminated C strings. */
static char* U8 = "\xEF\xBB\xBF"; /* UTF-8 (3 bytes) */
static char* BE32 = "\x00\x00\xFE\xFF"; /* UTF-32; big-endian (4 bytes) */
static char* LE32 = "\xFF\xFE\x00\x00"; /* UTF-32; little-endian (4 bytes) */
static char* BE16 = "\xFE\xFF"; /* UTF-16; big-endian (2 bytes) */
static char* LE16 = "\xFF\xFE"; /* UTF-16; little-endian (2 bytes) */
133+
127134
/* The default minimum iterator size depends
128135
on whether we are doing binary or language
129136
based output.
@@ -371,11 +378,36 @@ main(
371378

372379
fp = stdin;
373380
if (argc > 0 && strcmp(argv[0], "-") != 0) {
381+
char bom[4];
382+
size_t count;
374383
if ((fp = fopen(argv[0], "r")) == NULL) {
375384
derror ("can't open file %s for reading: ", argv[0]);
376385
perror("");
377386
return(7);
378387
}
388+
/* Check the leading bytes for an occurrence of a BOM */
389+
/* re: http://www.unicode.org/faq/utf_bom.html#BOM */
390+
/* Attempt to read the first two bytes; the first byte alone is used to classify the BOM */
391+
memset(bom,0,sizeof(bom));
392+
count = fread(bom,1,2,fp);
393+
if(count == 2) {
394+
switch (bom[0]) {
395+
case '\x00':
396+
case '\xFF':
397+
case '\xFE':
398+
/* Only UTF-8 is allowed; complain and exit */
399+
fprintf(stderr,"Input file contains a BOM indicating a non-UTF8 encoding\n");
400+
return 1;
401+
case '\xEF':
402+
/* skip the BOM */
403+
fread(bom,1,1,fp);
404+
break;
405+
default: /* legal printable char, presumably; rewind */
406+
rewind(fp);
407+
break;
408+
}
409+
}
410+
379411
cdlname = (char*)emalloc(NC_MAX_NAME);
380412
cdlname = nulldup(argv[0]);
381413
if(strlen(cdlname) > NC_MAX_NAME) cdlname[NC_MAX_NAME] = '\0';

0 commit comments

Comments
 (0)