Skip to content

Crash when opening same NC4 file from different threads BUT under global mutex #2496

@rouault

Description

@rouault

This is an attempt at providing a minimum reproducer for OSGeo/gdal#6253

The attached docker.zip contains a simple Dockerfile building libhdf5 and libnetcdf, and a simple C++ program.
The C++ program repeatedly creates 2 threads that each open the same NC4 file; one of them also calls nc_inq_varname(). Note that all calls to the netCDF API are protected by a common mutex, so there is no concurrent access to the netCDF API.

I've tried different versions of hdf5 and netcdf, and compiling hdf5 with or without --enable-unsupported --enable-threadsafe, but the crash always occurs.

How to reproduce:

unzip docker.zip
cd docker
docker build -t netcdf_issue .
docker run --rm -it netcdf_issue:latest  /usr/bin/test /alldatatypes.nc
docker run --rm -it netcdf_issue:latest  valgrind /usr/bin/test /alldatatypes.nc || echo "this failed!"

results in

T1 begin
T2 begin
T1 open 65536
T2 open 131072
T1 close 65536
T1 end
T2 begin nc_inq_varname(131072, 15)
this failed!

and under valgrind:

$ docker run --rm -it netcdf_issue:latest  valgrind /usr/bin/test /alldatatypes.nc
[...]
==1== Thread 3:
==1== Invalid read of size 1
==1==    at 0x4F921F8: H5F_addr_decode (in /usr/lib/libhdf5.so.302.0.0)
==1==    by 0x519A1F6: H5VL__native_blob_specific (in /usr/lib/libhdf5.so.302.0.0)
==1==    by 0x51922AC: H5VL_blob_specific (in /usr/lib/libhdf5.so.302.0.0)
==1==    by 0x517F48F: H5T__vlen_disk_isnull (in /usr/lib/libhdf5.so.302.0.0)
==1==    by 0x510387B: H5T__conv_vlen (in /usr/lib/libhdf5.so.302.0.0)
==1==    by 0x50EDD6B: H5T_convert (in /usr/lib/libhdf5.so.302.0.0)
==1==    by 0x4F57159: H5D_get_create_plist (in /usr/lib/libhdf5.so.302.0.0)
==1==    by 0x519B3D4: H5VL__native_dataset_get (in /usr/lib/libhdf5.so.302.0.0)
==1==    by 0x5186A77: H5VL_dataset_get (in /usr/lib/libhdf5.so.302.0.0)
==1==    by 0x4F2772A: H5Dget_create_plist (in /usr/lib/libhdf5.so.302.0.0)
==1==    by 0x48EDCB5: nc4_get_var_meta (in /usr/lib/libnetcdf.so.19.1.0)
==1==    by 0x48EAC09: nc4_hdf5_find_grp_var_att (in /usr/lib/libnetcdf.so.19.1.0)
==1==  Address 0x40 is not stack'd, malloc'd or (recently) free'd
==1== 
==1== 
==1== Process terminating with default action of signal 11 (SIGSEGV): dumping core
==1==  Access not within mapped region at address 0x40
==1==    at 0x4F921F8: H5F_addr_decode (in /usr/lib/libhdf5.so.302.0.0)
==1==    by 0x519A1F6: H5VL__native_blob_specific (in /usr/lib/libhdf5.so.302.0.0)
==1==    by 0x51922AC: H5VL_blob_specific (in /usr/lib/libhdf5.so.302.0.0)
==1==    by 0x517F48F: H5T__vlen_disk_isnull (in /usr/lib/libhdf5.so.302.0.0)
==1==    by 0x510387B: H5T__conv_vlen (in /usr/lib/libhdf5.so.302.0.0)
==1==    by 0x50EDD6B: H5T_convert (in /usr/lib/libhdf5.so.302.0.0)
==1==    by 0x4F57159: H5D_get_create_plist (in /usr/lib/libhdf5.so.302.0.0)
==1==    by 0x519B3D4: H5VL__native_dataset_get (in /usr/lib/libhdf5.so.302.0.0)
==1==    by 0x5186A77: H5VL_dataset_get (in /usr/lib/libhdf5.so.302.0.0)
==1==    by 0x4F2772A: H5Dget_create_plist (in /usr/lib/libhdf5.so.302.0.0)
==1==    by 0x48EDCB5: nc4_get_var_meta (in /usr/lib/libnetcdf.so.19.1.0)
==1==    by 0x48EAC09: nc4_hdf5_find_grp_var_att (in /usr/lib/libnetcdf.so.19.1.0)
==1==  If you believe this happened as a result of a stack

Dockerfile:

# Image that builds HDF5 and netCDF-C from source and compiles the reproducer against them.
FROM ubuntu:20.04
# Install the build toolchain and the download/unpack utilities used by the steps below.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --fix-missing --no-install-recommends g++ make autoconf automake zlib1g-dev wget tar ca-certificates
# Download the hdf5-1_13_2 tag, configure it for /usr with static libraries and tests disabled,
# then build and install it. The commented-out line inside the RUN lists extra configure flags
# (presumably appended to ./configure to try a thread-safe HDF5 build).
RUN wget https://github.com/HDFGroup/hdf5/archive/refs/tags/hdf5-1_13_2.tar.gz && \
    tar xvzf hdf5-1_13_2.tar.gz && \
    cd hdf5-hdf5-1_13_2 && \
    #  --enable-unsupported --enable-threadsafe
    ./configure --prefix=/usr --disable-static --disable-tests && \
    make -j$(nproc) install
# Download netCDF-C v4.9.0, configure it for /usr, then build and install it.
RUN wget https://github.com/Unidata/netcdf-c/archive/refs/tags/v4.9.0.tar.gz && \
    tar xvzf v4.9.0.tar.gz && \
    cd netcdf-c-4.9.0 && \
    ./configure --prefix=/usr && \
    make -j$(nproc) install
# Copy the reproducer source and its input file into the image root.
COPY test.cpp /
COPY alldatatypes.nc /
# Compile the reproducer against libnetcdf and pthreads; the binary is installed as /usr/bin/test.
RUN g++ /test.cpp -lnetcdf -lpthread -o /usr/bin/test
# Install valgrind last. There is no apt-get update here: this step reuses the package index
# downloaded by the first RUN.
RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --fix-missing --no-install-recommends valgrind

test.cpp:

#include <chrono>
#include <cstdio>
#include <mutex>
#include <thread>

#include "netcdf.h"

// Global mutex held around every netCDF API call made by the worker threads in main(),
// so that no two netCDF calls ever run concurrently.
std::mutex oMutex;

int main(int argc, char* argv[])
{
    for (int i = 0; i < 1000; i++) {
        std::thread t1([argv] {
            int cdfid;
            printf("T1 begin\n");
            {
                std::lock_guard<std::mutex> oLock(oMutex);
                nc_open(argv[1], NC_NOWRITE, &cdfid);
                printf("T1 open %d\n", cdfid);
            }
            std::this_thread::sleep_for(std::chrono::microseconds(1));
            {
                std::lock_guard<std::mutex> oLock(oMutex);
                nc_close(cdfid);
                printf("T1 close %d\n", cdfid);
            }
            printf("T1 end\n");
        });
        std::thread t2([argv] {
            int cdfid2;
            printf("T2 begin\n");
            {
                std::lock_guard<std::mutex> oLock(oMutex);
                nc_open(argv[1], NC_NOWRITE, &cdfid2);
                printf("T2 open %d\n", cdfid2);
            }
            std::this_thread::sleep_for(std::chrono::microseconds(1));
            {
                int nVarId = 0;
                std::lock_guard<std::mutex> oLock(oMutex);
                nc_inq_varid(cdfid2, "string_var", &nVarId);
                char szName[NC_MAX_NAME + 1] = {};
                printf("T2 begin nc_inq_varname(%d, %d)\n", cdfid2, nVarId);
                nc_inq_varname(cdfid2, nVarId, szName);
                printf("T2 end nc_inq_varname(%d, %d)\n", cdfid2, nVarId);
            }
            {
                std::lock_guard<std::mutex> oLock(oMutex);
                nc_close(cdfid2);
                printf("T2 close %d\n", cdfid2);
            }
            printf("T2 end\n");

        });
        t1.join();
        t2.join();
    }
    printf("success!\n");
    return 0;
}

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions