-
Notifications
You must be signed in to change notification settings - Fork 541
Expand file tree
/
Copy pathFileUtil.java
More file actions
1864 lines (1608 loc) · 77.9 KB
/
FileUtil.java
File metadata and controls
1864 lines (1608 loc) · 77.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
Copyright (C) 2005-2012, by the President and Fellows of Harvard College.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
Dataverse Network - A web application to share, preserve and analyze research data.
Developed at the Institute for Quantitative Social Science, Harvard University.
Version 3.0.
*/
package edu.harvard.iq.dataverse.util;
import edu.harvard.iq.dataverse.DataFile;
import edu.harvard.iq.dataverse.DataFile.ChecksumType;
import edu.harvard.iq.dataverse.DataFileServiceBean;
import edu.harvard.iq.dataverse.Dataset;
import edu.harvard.iq.dataverse.DatasetVersion;
import edu.harvard.iq.dataverse.FileMetadata;
import edu.harvard.iq.dataverse.TermsOfUseAndAccess;
import edu.harvard.iq.dataverse.dataaccess.DataAccess;
import edu.harvard.iq.dataverse.dataaccess.ImageThumbConverter;
import edu.harvard.iq.dataverse.dataaccess.S3AccessIO;
import edu.harvard.iq.dataverse.dataset.DatasetThumbnail;
import edu.harvard.iq.dataverse.datasetutility.FileExceedsMaxSizeException;
import static edu.harvard.iq.dataverse.datasetutility.FileSizeChecker.bytesToHumanReadable;
import edu.harvard.iq.dataverse.ingest.IngestReport;
import edu.harvard.iq.dataverse.ingest.IngestServiceBean;
import edu.harvard.iq.dataverse.ingest.IngestServiceShapefileHelper;
import edu.harvard.iq.dataverse.ingest.IngestableDataChecker;
import java.awt.image.BufferedImage;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.channels.FileChannel;
import java.nio.channels.WritableByteChannel;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.sql.Timestamp;
import java.text.MessageFormat;
import java.text.SimpleDateFormat;
import java.util.Map;
import java.util.MissingResourceException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.UUID;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.activation.MimetypesFileTypeMap;
import javax.ejb.EJBException;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import org.apache.commons.io.FileUtils;
import java.util.zip.GZIPInputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import org.apache.commons.io.FilenameUtils;
import com.amazonaws.AmazonServiceException;
import edu.harvard.iq.dataverse.dataaccess.DataAccessOption;
import edu.harvard.iq.dataverse.dataaccess.StorageIO;
import java.util.Arrays;
import org.apache.commons.io.IOUtils;
/**
* a 4.0 implementation of the DVN FileUtil;
* it provides some of the functionality from the 3.6 implementation,
* but the old code is ported creatively on the method-by-method basis.
*
* @author Leonid Andreev
*/
public class FileUtil implements java.io.Serializable {
// Shared logger for all the static utility methods in this class.
private static final Logger logger = Logger.getLogger(FileUtil.class.getCanonicalName());
// Formats our tabular-data detection (IngestableDataChecker) will attempt to recognize.
private static final String[] TABULAR_DATA_FORMAT_SET = {"POR", "SAV", "DTA", "RDA"};
// Maps well-known statistical-package file extensions to mime types;
// populated in the static block below (see the comment there for rationale).
private static Map<String, String> STATISTICAL_FILE_EXTENSION = new HashMap<String, String>();
/*
* The following are Stata, SAS and SPSS syntax/control cards:
* These are recognized as text files (because they are!) so
* we check all the uploaded "text/plain" files for these extensions, and
* assign the following types when they are matched;
* Note that these types are only used in the metadata displayed on the
* dataset page. We don't support ingest on control cards.
* -- L.A. 4.0 Oct. 2014
*/
static {
STATISTICAL_FILE_EXTENSION.put("do", "application/x-stata-syntax");
STATISTICAL_FILE_EXTENSION.put("sas", "application/x-sas-syntax");
STATISTICAL_FILE_EXTENSION.put("sps", "application/x-spss-syntax");
STATISTICAL_FILE_EXTENSION.put("csv", "text/csv");
STATISTICAL_FILE_EXTENSION.put("tsv", "text/tsv");
}
// Extension-to-mime-type map from the activation framework; used as one of
// the fallbacks in determineFileTypeByExtension().
private static MimetypesFileTypeMap MIME_TYPE_MAP = new MimetypesFileTypeMap();
// Canonical mime-type constants used throughout the upload/ingest code:
public static final String MIME_TYPE_STATA = "application/x-stata";
public static final String MIME_TYPE_STATA13 = "application/x-stata-13";
public static final String MIME_TYPE_STATA14 = "application/x-stata-14";
public static final String MIME_TYPE_STATA15 = "application/x-stata-15";
public static final String MIME_TYPE_RDATA = "application/x-rlang-transport";
public static final String MIME_TYPE_CSV = "text/csv";
public static final String MIME_TYPE_CSV_ALT = "text/comma-separated-values";
public static final String MIME_TYPE_TSV = "text/tsv";
public static final String MIME_TYPE_TSV_ALT = "text/tab-separated-values";
public static final String MIME_TYPE_XLSX = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
public static final String MIME_TYPE_SPSS_SAV = "application/x-spss-sav";
public static final String MIME_TYPE_SPSS_POR = "application/x-spss-por";
public static final String MIME_TYPE_FITS = "application/fits";
public static final String MIME_TYPE_ZIP = "application/zip";
public static final String MIME_TYPE_FITSIMAGE = "image/fits";
// SHAPE file type:
// this is the only supported file type in the GEO DATA class:
public static final String MIME_TYPE_GEO_SHAPE = "application/zipped-shapefile";
public static final String MIME_TYPE_UNDETERMINED_DEFAULT = "application/octet-stream";
public static final String MIME_TYPE_UNDETERMINED_BINARY = "application/binary";
// Extension appended to the saved pre-ingest original of a tabular file.
public static final String SAVED_ORIGINAL_FILENAME_EXTENSION = "orig";
// Type given to the tab-delimited product of a successful ingest.
public static final String MIME_TYPE_INGESTED_FILE = "text/tab-separated-values";
// File type "thumbnail classes" tags:
public static final String FILE_THUMBNAIL_CLASS_AUDIO = "audio";
public static final String FILE_THUMBNAIL_CLASS_CODE = "code";
public static final String FILE_THUMBNAIL_CLASS_DOCUMENT = "document";
public static final String FILE_THUMBNAIL_CLASS_ASTRO = "astro";
public static final String FILE_THUMBNAIL_CLASS_IMAGE = "image";
public static final String FILE_THUMBNAIL_CLASS_NETWORK = "network";
public static final String FILE_THUMBNAIL_CLASS_GEOSHAPE = "geodata";
public static final String FILE_THUMBNAIL_CLASS_TABULAR = "tabular";
public static final String FILE_THUMBNAIL_CLASS_VIDEO = "video";
public static final String FILE_THUMBNAIL_CLASS_PACKAGE = "package";
public static final String FILE_THUMBNAIL_CLASS_OTHER = "other";
// File type facets, as returned by the getFacetFileType() method in this utility:
private static final String FILE_FACET_CLASS_ARCHIVE = "Archive";
private static final String FILE_FACET_CLASS_AUDIO = "Audio";
private static final String FILE_FACET_CLASS_CODE = "Code";
private static final String FILE_FACET_CLASS_DATA = "Data";
private static final String FILE_FACET_CLASS_DOCUMENT = "Document";
private static final String FILE_FACET_CLASS_ASTRO = "FITS";
private static final String FILE_FACET_CLASS_IMAGE = "Image";
private static final String FILE_FACET_CLASS_NETWORK = "Network Data";
private static final String FILE_FACET_CLASS_GEOSHAPE = "Shape";
private static final String FILE_FACET_CLASS_TABULAR = "Tabular Data";
private static final String FILE_FACET_CLASS_VIDEO = "Video";
private static final String FILE_FACET_CLASS_TEXT = "Text";
private static final String FILE_FACET_CLASS_OTHER = "Other";
private static final String FILE_FACET_CLASS_UNKNOWN = "Unknown";
// The file type facets and type-specific thumbnail classes (above) are
// very similar, but not exactly 1:1; so the following map is for
// maintaining the relationship between the two:
public static Map<String, String> FILE_THUMBNAIL_CLASSES = new HashMap<String, String>();
static {
FILE_THUMBNAIL_CLASSES.put(FILE_FACET_CLASS_VIDEO, FILE_THUMBNAIL_CLASS_VIDEO);
FILE_THUMBNAIL_CLASSES.put(FILE_FACET_CLASS_AUDIO, FILE_THUMBNAIL_CLASS_AUDIO);
FILE_THUMBNAIL_CLASSES.put(FILE_FACET_CLASS_CODE, FILE_THUMBNAIL_CLASS_CODE);
FILE_THUMBNAIL_CLASSES.put(FILE_FACET_CLASS_DATA, FILE_THUMBNAIL_CLASS_TABULAR);
FILE_THUMBNAIL_CLASSES.put(FILE_FACET_CLASS_NETWORK, FILE_THUMBNAIL_CLASS_NETWORK);
FILE_THUMBNAIL_CLASSES.put(FILE_FACET_CLASS_ASTRO, FILE_THUMBNAIL_CLASS_ASTRO);
FILE_THUMBNAIL_CLASSES.put(FILE_FACET_CLASS_IMAGE, FILE_THUMBNAIL_CLASS_IMAGE);
FILE_THUMBNAIL_CLASSES.put(FILE_FACET_CLASS_DOCUMENT, FILE_THUMBNAIL_CLASS_DOCUMENT);
FILE_THUMBNAIL_CLASSES.put(FILE_FACET_CLASS_GEOSHAPE, FILE_THUMBNAIL_CLASS_GEOSHAPE);
FILE_THUMBNAIL_CLASSES.put(FILE_FACET_CLASS_TABULAR, FILE_THUMBNAIL_CLASS_TABULAR);
FILE_THUMBNAIL_CLASSES.put(FILE_FACET_CLASS_TEXT, FILE_THUMBNAIL_CLASS_DOCUMENT);
FILE_THUMBNAIL_CLASSES.put(FILE_FACET_CLASS_OTHER, FILE_THUMBNAIL_CLASS_OTHER);
FILE_THUMBNAIL_CLASSES.put(FILE_FACET_CLASS_UNKNOWN, FILE_THUMBNAIL_CLASS_OTHER);
FILE_THUMBNAIL_CLASSES.put(FILE_FACET_CLASS_ARCHIVE, FILE_THUMBNAIL_CLASS_PACKAGE);
}
/**
* This string can be prepended to a Base64-encoded representation of a PNG
* file in order to imbed an image directly into an HTML page using the
* "img" tag. See also https://en.wikipedia.org/wiki/Data_URI_scheme
*/
public static String DATA_URI_SCHEME = "data:image/png;base64,";
// Public no-arg constructor. NOTE(review): the class is Serializable and
// instantiable even though all methods are static — presumably required by
// some container/framework usage; confirm before removing.
public FileUtil() {
}
/**
 * Copies the contents of inputFile into outputFile, overwriting outputFile
 * if it already exists (matching the truncating behavior of the previous
 * FileOutputStream-based implementation).
 *
 * The original hand-rolled channel loop advanced its offset by a fixed block
 * size without checking how many bytes transferTo() actually moved, which
 * could silently skip data on a short transfer; Files.copy performs a
 * complete copy and handles resource cleanup internally.
 *
 * @param inputFile  source file; must exist and be readable
 * @param outputFile destination file; created or replaced
 * @throws IOException if either file cannot be accessed or the copy fails
 */
public static void copyFile(File inputFile, File outputFile) throws IOException {
    Files.copy(inputFile.toPath(), outputFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
}
/**
 * Extracts the lower-cased extension from a file name — everything after the
 * final '.' — or null when the name contains no '.' at all. A name that ends
 * in '.' yields the empty string.
 */
public static String getFileExtension(String fileName) {
    int dotPosition = fileName.lastIndexOf('.');
    if (dotPosition == -1) {
        return null;
    }
    return fileName.substring(dotPosition + 1).toLowerCase();
}
/**
 * Convenience overload: replaces the extension of the given name with "tab",
 * the extension used for ingested tabular files.
 */
public static String replaceExtension(String originalName) {
    return replaceExtension(originalName, "tab");
}

/**
 * Returns originalName with its extension (the part after the last '.')
 * replaced by newExtension; when the name has no '.' at all, newExtension is
 * simply appended.
 */
public static String replaceExtension(String originalName, String newExtension) {
    int dotPosition = originalName.lastIndexOf('.');
    String baseName = (dotPosition == -1) ? originalName : originalName.substring(0, dotPosition);
    return baseName + "." + newExtension;
}
/**
* Returns a display-friendly name for the file's content type: shapefiles get
* their dedicated friendly name; otherwise the type (with any ";"-delimited
* parameters stripped) is looked up in the MimeTypeDisplay properties bundle,
* falling back to the raw mime type when no display name is defined.
* Returns null when the DataFile has no content type at all.
*/
public static String getUserFriendlyFileType(DataFile dataFile) {
String fileType = dataFile.getContentType();
if (fileType != null) {
if (fileType.equalsIgnoreCase(ShapefileHandler.SHAPEFILE_FILE_TYPE)){
return ShapefileHandler.SHAPEFILE_FILE_TYPE_FRIENDLY_NAME;
}
// drop any parameters (e.g. "; charset=...") before the bundle lookup
if (fileType.contains(";")) {
fileType = fileType.substring(0, fileType.indexOf(";"));
}
try {
return BundleUtil.getStringFromPropertyFile(fileType,"MimeTypeDisplay" );
} catch (MissingResourceException e) {
// no display name defined for this type; show the raw mime type
return fileType;
}
}
return fileType;
}
/**
* Returns the facet label used when *indexing* this file's type. Same fallback
* logic as getFacetFileType(), but the label is read via
* getStringFromDefaultPropertyFile rather than getStringFromPropertyFile —
* presumably so the indexed value is locale-independent; confirm against the
* search-indexing code.
*/
public static String getIndexableFacetFileType(DataFile dataFile) {
String fileType = getFileType(dataFile);
try {
return BundleUtil.getStringFromDefaultPropertyFile(fileType,"MimeTypeFacets" );
} catch (MissingResourceException ex) {
// if there's no defined "facet-friendly" form of this mime type
// we'll truncate the available type by "/", e.g., all the
// unknown image/* types will become "image".
// Since many other, quite different types would then all become
// "application" - we will use the facet "Other" for all the
// application/* types not specifically defined in the properties file.
//
// UPDATE, MH 4.9.2
// Since production is displaying both "tabulardata" and "Tabular Data"
// we are going to try to add capitalization here to this function
// in order to capitalize all the unknown types that are not called
// out in MimeTypeFacets.properties
if (!StringUtil.isEmpty(fileType)) {
String typeClass = fileType.split("/")[0];
if ("application".equalsIgnoreCase(typeClass)) {
return FILE_FACET_CLASS_OTHER;
}
// capitalize the primary type, e.g. "image/foo" -> "Image"
return Character.toUpperCase(typeClass.charAt(0)) + typeClass.substring(1);
} else {
return null;
}
}
}
/**
 * Returns the DataFile's content type with any ";"-delimited parameters
 * (e.g. "; charset=...") stripped off; falls back to
 * "application/octet-stream" when no content type is set.
 */
public static String getFileType(DataFile dataFile) {
    String fileType = dataFile.getContentType();
    if (StringUtil.isEmpty(fileType)) {
        return "application/octet-stream";
    }
    int semicolonPosition = fileType.indexOf(';');
    return (semicolonPosition == -1) ? fileType : fileType.substring(0, semicolonPosition);
}
/**
 * Maps the file's mime type to the human-friendly facet label defined in the
 * MimeTypeFacets properties bundle. When the bundle has no entry for the
 * exact type, falls back to the capitalized primary type (e.g. "image/foo"
 * becomes "Image") — except that unlisted "application/..." types, which are
 * too diverse to share one facet, are grouped under "Other".
 * Returns null when the type is empty.
 */
public static String getFacetFileType(DataFile dataFile) {
    String fileType = getFileType(dataFile);
    try {
        return BundleUtil.getStringFromPropertyFile(fileType,"MimeTypeFacets" );
    } catch (MissingResourceException ex) {
        // No facet-friendly label defined for this exact mime type.
        // (Per the MH 4.9.2 note in the original: unknown types are
        // capitalized here so the UI does not mix raw and display forms,
        // e.g. "tabulardata" vs "Tabular Data".)
        if (StringUtil.isEmpty(fileType)) {
            return null;
        }
        String typeClass = fileType.split("/")[0];
        if ("application".equalsIgnoreCase(typeClass)) {
            return FILE_FACET_CLASS_OTHER;
        }
        return Character.toUpperCase(typeClass.charAt(0)) + typeClass.substring(1);
    }
}
/**
* Returns a display-friendly name for the *original* (pre-ingest) format of a
* tabular DataFile, looked up in the MimeTypeDisplay bundle. Returns null for
* non-tabular files, and "UNKNOWN" when the original format is not recorded.
*/
public static String getUserFriendlyOriginalType(DataFile dataFile) {
if (!dataFile.isTabularData()) {
return null;
}
String fileType = dataFile.getOriginalFileFormat();
if (fileType != null && !fileType.equals("")) {
// drop any parameters (e.g. "; charset=...") before the bundle lookup
if (fileType.contains(";")) {
fileType = fileType.substring(0, fileType.indexOf(";"));
}
try {
return BundleUtil.getStringFromPropertyFile(fileType,"MimeTypeDisplay" );
} catch (MissingResourceException e) {
// no display name defined; fall back to the raw mime type
return fileType;
}
}
return "UNKNOWN";
}
/**
* Returns a content type string for a FileObject
*
* @param fileObject the file to inspect; may be null
* @return the mime type detected by determineFileType();
* MIME_TYPE_UNDETERMINED_DEFAULT when detection fails or yields nothing;
* or null when fileObject itself is null
*/
private static String determineContentType(File fileObject) {
if (fileObject==null){
return null;
}
String contentType;
try {
contentType = determineFileType(fileObject, fileObject.getName());
} catch (Exception ex) {
// detection is best-effort: log and fall through to the default type
logger.warning("FileUtil.determineFileType failed for file with name: " + fileObject.getName());
contentType = null;
}
if ((contentType==null)||(contentType.equals(""))){
contentType = MIME_TYPE_UNDETERMINED_DEFAULT;
}
return contentType;
}
/**
* Re-runs the tabular-data format detection on the file and returns the newly
* detected type if there is one; otherwise returns the fileType passed in.
*/
public static String retestIngestableFileType(File file, String fileType) {
IngestableDataChecker tabChecker = new IngestableDataChecker(TABULAR_DATA_FORMAT_SET);
String newType = tabChecker.detectTabularDataFormat(file);
return newType != null ? newType : fileType;
}
/**
 * Determines the mime type of the given file, trying increasingly generic
 * checks in order:
 *   1. our own tabular-data detection (Stata, SPSS, R, ...);
 *   2. GraphML and FITS recognition;
 *   3. the Jhove type detector;
 *   4. a lookup by file extension;
 *   5. content checks inside gzip (FITS) and zip (shapefile) containers.
 *
 * @param f        the uploaded file on the local filesystem
 * @param fileName the original file name (used for the extension checks)
 * @return the detected mime type, or null when nothing could be determined
 * @throws IOException if the container checks cannot open the file
 */
public static String determineFileType(File f, String fileName) throws IOException {
    String fileType = null;
    String fileExtension = getFileExtension(fileName);

    // step 1:
    // Apply our custom methods to try and recognize data files that can be
    // converted to tabular data, or can be parsed for extra metadata
    // (such as FITS).
    logger.fine("Attempting to identify potential tabular data files;");
    IngestableDataChecker tabChk = new IngestableDataChecker(TABULAR_DATA_FORMAT_SET);
    fileType = tabChk.detectTabularDataFormat(f);
    logger.fine("determineFileType: tabular data checker found " + fileType);

    // step 2: if not found, check if graphml or FITS
    if (fileType == null) {
        if (isGraphMLFile(f)) {
            fileType = "text/xml-graphml";
        } else if (isFITSFile(f)
                || (fileExtension != null && fileExtension.equalsIgnoreCase("fits"))) {
            // Our FITS check is fairly weak (it appears to be hard to really
            // recognize a FITS file without reading the entire stream...), so
            // in version 3.* we used to insist on *both* the ".fits" extension
            // and the header check; in 4.0 we accept either the extension or
            // the valid magic header.
            fileType = "application/fits";
        }
    }

    // step 3: check the mime type of this file with Jhove
    if (fileType == null) {
        JhoveFileType jw = new JhoveFileType();
        String mimeType = jw.getFileMimeType(f);
        if (mimeType != null) {
            fileType = mimeType;
        }
    }

    // step 4:
    // Additional processing; if we haven't gotten much useful information
    // back from Jhove, we'll try and make an educated guess based on
    // the file extension:
    if (fileExtension != null) {
        logger.fine("fileExtension=" + fileExtension);
        if (fileType == null || fileType.startsWith("text/plain") || "application/octet-stream".equals(fileType)) {
            if (fileType != null && fileType.startsWith("text/plain") && STATISTICAL_FILE_EXTENSION.containsKey(fileExtension)) {
                // plain-text files with known statistical-syntax extensions
                // (.do, .sas, .sps, ...) get their dedicated types
                fileType = STATISTICAL_FILE_EXTENSION.get(fileExtension);
            } else {
                fileType = determineFileTypeByExtension(fileName);
            }
            logger.fine("mime type recognized by extension: " + fileType);
        }
    } else {
        logger.fine("fileExtension is null");
    }

    // step 5:
    // if this is a compressed file - zip or gzip - we'll check the
    // file(s) inside the compressed stream and see if it's one of our
    // recognized formats that we want to support compressed:
    if ("application/x-gzip".equals(fileType)) {
        logger.fine("we'll run additional checks on this gzipped file.");
        // We want to be able to support gzipped FITS files, same way as
        // if they were just regular FITS files.
        // (new FileInputStream() can throw a "file not found" exception;
        // however, if we've made it this far, it really means that the
        // file does exist and can be opened)
        FileInputStream gzippedIn = new FileInputStream(f);
        try (InputStream uncompressedIn = new GZIPInputStream(gzippedIn)) {
            if (isFITSFile(uncompressedIn)) {
                fileType = "application/fits-gzipped";
            }
        } catch (IOException ioex) {
            // not a readable gzip stream after all; keep the generic gzip type
        } finally {
            // Close the outer stream explicitly: if the GZIPInputStream
            // constructor threw (invalid gzip header), the try-with-resources
            // never took ownership of it. (The original code leaked it here.)
            try { gzippedIn.close(); } catch (IOException e) {}
        }
    }
    if ("application/zip".equals(fileType)) {
        // Is this a zipped Shapefile?
        // Check for shapefile extensions as described here:
        // http://en.wikipedia.org/wiki/Shapefile
        try (FileInputStream zippedIn = new FileInputStream(f)) {
            ShapefileHandler shp_handler = new ShapefileHandler(zippedIn);
            if (shp_handler.containsShapefile()) {
                fileType = ShapefileHandler.SHAPEFILE_FILE_TYPE; // "application/zipped-shapefile"
            }
        }
        // (the original never closed this stream; try-with-resources fixes the leak)
    }
    logger.fine("returning fileType " + fileType);
    return fileType;
}
/**
 * Looks up a mime type for the file name via the activation framework's
 * extension map; when that map only offers the generic
 * "application/octet-stream", consults our own properties-file-based lookup
 * instead. Returns null when the map yields nothing at all.
 */
public static String determineFileTypeByExtension(String fileName) {
    String mappedType = MIME_TYPE_MAP.getContentType(fileName);
    logger.fine("MimetypesFileTypeMap type by extension, for " + fileName + ": " + mappedType);
    if (mappedType == null) {
        return null;
    }
    return "application/octet-stream".equals(mappedType)
            ? lookupFileTypeFromPropertiesFile(fileName)
            : mappedType;
}
/**
* Looks up a mime type for the file's extension in the
* MimeTypeDetectionByFileExtension properties bundle; returns null (and logs
* a hint about extending the bundle) when the extension is unknown.
*/
public static String lookupFileTypeFromPropertiesFile(String fileName) {
String fileExtension = FilenameUtils.getExtension(fileName);
String propertyFileName = "MimeTypeDetectionByFileExtension";
String propertyFileNameOnDisk = propertyFileName + ".properties";
try {
logger.fine("checking " + propertyFileNameOnDisk + " for file extension " + fileExtension);
return BundleUtil.getStringFromPropertyFile(fileExtension, propertyFileName);
} catch (MissingResourceException ex) {
logger.info(fileExtension + " is a file extension Dataverse doesn't know about. Consider adding it to the " + propertyFileNameOnDisk + " file.");
return null;
}
}
/*
* Custom method for identifying FITS files:
* TODO:
* the existing check for the "magic header" is very weak (see below);
* it should probably be replaced by attempting to parse and read at
* least the primary HDU, using the NOM fits parser.
* -- L.A. 4.0 alpha
*/
private static boolean isFITSFile(File file) {
BufferedInputStream ins = null;
try {
ins = new BufferedInputStream(new FileInputStream(file));
// the InputStream variant closes the stream for us (in its finally block)
return isFITSFile(ins);
} catch (IOException ex) {
// unreadable file: simply treat it as "not FITS"
}
return false;
}
/**
 * Checks whether the stream begins with the FITS magic word "SIMPLE".
 * The stream is always closed before returning, on every path.
 *
 * @return true only when the first six bytes spell "SIMPLE"
 */
private static boolean isFITSFile(InputStream ins) {
    boolean isFITS = false;
    // A FITS file begins with the ASCII keyword "SIMPLE" in its primary header.
    String magicWord = "SIMPLE";
    int magicWordLength = magicWord.length();
    try {
        byte[] b = new byte[magicWordLength];
        logger.fine("attempting to read " + magicWordLength + " bytes from the FITS format candidate stream.");
        // Read in a loop: a single read() may legitimately return fewer bytes
        // than requested without being at end-of-stream (e.g. when the caller
        // hands us a GZIPInputStream), and the original single-read check
        // would then misclassify a valid FITS file.
        int bytesRead = 0;
        while (bytesRead < magicWordLength) {
            int n = ins.read(b, bytesRead, magicWordLength - bytesRead);
            if (n < 0) {
                throw new IOException("stream ended before " + magicWordLength + " bytes could be read");
            }
            bytesRead += n;
        }
        // Decode with an explicit charset; the original used the platform
        // default, which is not guaranteed to be ASCII-compatible everywhere.
        if (magicWord.equals(new String(b, Charset.forName("US-ASCII")))) {
            logger.fine("yes, this is FITS file!");
            isFITS = true;
        }
    } catch (IOException ex) {
        isFITS = false;
    } finally {
        if (ins != null) {
            try {
                ins.close();
            } catch (Exception e) {
                // best-effort close; nothing useful to do here
            }
        }
    }
    return isFITS;
}
/**
 * Checks whether the file is a GraphML document by streaming its XML and
 * inspecting the first START_ELEMENT: it must be named "graphml" and carry an
 * xsi:schemaLocation referencing the GraphML 1.0 schema.
 *
 * @return true only when both the element name and schema location match
 * @throws EJBException wrapping any IOException raised while reading the file
 */
private static boolean isGraphMLFile(File file) {
    boolean isGraphML = false;
    logger.fine("begin isGraphMLFile()");
    // try-with-resources: the original code never closed the FileReader on
    // any path, leaking a file handle per call.
    try (FileReader fileReader = new FileReader(file)) {
        javax.xml.stream.XMLInputFactory xmlif = javax.xml.stream.XMLInputFactory.newInstance();
        xmlif.setProperty("javax.xml.stream.isCoalescing", java.lang.Boolean.TRUE);
        XMLStreamReader xmlr = xmlif.createXMLStreamReader(fileReader);
        // Only the first START_ELEMENT matters; we break out as soon as we see it.
        for (int event = xmlr.next(); event != XMLStreamConstants.END_DOCUMENT; event = xmlr.next()) {
            if (event == XMLStreamConstants.START_ELEMENT) {
                if (xmlr.getLocalName().equals("graphml")) {
                    String schema = xmlr.getAttributeValue("http://www.w3.org/2001/XMLSchema-instance", "schemaLocation");
                    logger.fine("schema = " + schema);
                    if (schema != null && schema.contains("http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd")) {
                        logger.fine("graphML is true");
                        isGraphML = true;
                    }
                }
                break;
            }
        }
    } catch (XMLStreamException e) {
        logger.fine("XML error - this is not a valid graphML file.");
        isGraphML = false;
    } catch (IOException e) {
        throw new EJBException(e);
    }
    logger.fine("end isGraphML()");
    return isGraphML;
}
// from MD5Checksum.java
/**
* Computes the checksum of the file at the given path, using the requested
* algorithm, by delegating to the InputStream variant (which also closes the
* stream).
*
* @throws RuntimeException wrapping a FileNotFoundException when the path
* cannot be opened
*/
public static String calculateChecksum(String datafile, ChecksumType checksumType) {
FileInputStream fis = null;
try {
fis = new FileInputStream(datafile);
} catch (FileNotFoundException ex) {
throw new RuntimeException(ex);
}
return FileUtil.calculateChecksum(fis, checksumType);
}
// from MD5Checksum.java
/**
* Computes the checksum of everything remaining in the stream, using the
* algorithm named by checksumType, and returns it as a lower-case hex string.
* The stream is always closed before returning, even on failure.
*
* @throws RuntimeException wrapping a NoSuchAlgorithmException for an unknown
* algorithm, or the IOException if reading the stream fails
*/
public static String calculateChecksum(InputStream in, ChecksumType checksumType) {
MessageDigest md = null;
try {
// Use "SHA-1" (toString) rather than "SHA1", for example.
md = MessageDigest.getInstance(checksumType.toString());
} catch (NoSuchAlgorithmException e) {
throw new RuntimeException(e);
}
byte[] dataBytes = new byte[1024];
int nread;
try {
while ((nread = in.read(dataBytes)) != -1) {
md.update(dataBytes, 0, nread);
}
} catch (IOException ex) {
throw new RuntimeException(ex);
} finally {
// best-effort close; a failure here should not mask the checksum result
try {
in.close();
} catch (Exception e) {
}
}
return checksumDigestToString(md.digest());
}
/**
* Computes the checksum of the given byte array using the algorithm named by
* checksumType, returned as a lower-case hex string.
*
* @throws RuntimeException wrapping a NoSuchAlgorithmException for an unknown
* algorithm
*/
public static String calculateChecksum(byte[] dataBytes, ChecksumType checksumType) {
MessageDigest md = null;
try {
// Use "SHA-1" (toString) rather than "SHA1", for example.
md = MessageDigest.getInstance(checksumType.toString());
} catch (NoSuchAlgorithmException e) {
throw new RuntimeException(e);
}
md.update(dataBytes);
return checksumDigestToString(md.digest());
}
/**
 * Renders a message-digest byte array as the conventional lower-case hex
 * string: two characters per byte, zero-padded.
 */
private static String checksumDigestToString(byte[] digestBytes) {
    StringBuilder hex = new StringBuilder(digestBytes.length * 2);
    for (byte digestByte : digestBytes) {
        // %02x treats byte values as unsigned and zero-pads to two digits
        hex.append(String.format("%02x", digestByte));
    }
    return hex.toString();
}
/**
 * Maps the mime type of an ingested tabular file to the filename extension
 * its saved original should carry; returns "" for types with no known
 * original extension.
 */
public static String generateOriginalExtension(String fileType) {
    // type -> extension lookup table (compared case-insensitively, in order)
    final String[][] typeToExtension = {
            {"application/x-spss-sav", ".sav"},
            {"application/x-spss-por", ".por"},
            {"application/x-stata", ".dta"},
            {"application/x-rlang-transport", ".RData"},
            {"text/csv", ".csv"},
            {"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", ".xlsx"},
    };
    for (String[] mapping : typeToExtension) {
        if (fileType.equalsIgnoreCase(mapping[0])) {
            return mapping[1];
        }
    }
    return "";
}
public static List<DataFile> createDataFiles(DatasetVersion version, InputStream inputStream, String fileName, String suppliedContentType, String newStorageIdentifier, String newCheckSum, SystemConfig systemConfig) throws IOException {
List<DataFile> datafiles = new ArrayList<>();
String warningMessage = null;
// save the file, in the temporary location for now:
Path tempFile = null;
Long fileSizeLimit = systemConfig.getMaxFileUploadSizeForStore(version.getDataset().getOwner().getEffectiveStorageDriverId());
String finalType = null;
if (newStorageIdentifier == null) {
if (getFilesTempDirectory() != null) {
tempFile = Files.createTempFile(Paths.get(getFilesTempDirectory()), "tmp", "upload");
// "temporary" location is the key here; this is why we are not using
// the DataStore framework for this - the assumption is that
// temp files will always be stored on the local filesystem.
// -- L.A. Jul. 2014
logger.fine("Will attempt to save the file as: " + tempFile.toString());
Files.copy(inputStream, tempFile, StandardCopyOption.REPLACE_EXISTING);
// A file size check, before we do anything else:
// (note that "no size limit set" = "unlimited")
// (also note, that if this is a zip file, we'll be checking
// the size limit for each of the individual unpacked files)
Long fileSize = tempFile.toFile().length();
if (fileSizeLimit != null && fileSize > fileSizeLimit) {
try {tempFile.toFile().delete();} catch (Exception ex) {}
throw new IOException (MessageFormat.format(BundleUtil.getStringFromBundle("file.addreplace.error.file_exceeds_limit"), bytesToHumanReadable(fileSize), bytesToHumanReadable(fileSizeLimit)));
}
} else {
throw new IOException("Temp directory is not configured.");
}
logger.fine("mime type supplied: " + suppliedContentType);
// Let's try our own utilities (Jhove, etc.) to determine the file type
// of the uploaded file. (We may already have a mime type supplied for this
// file - maybe the type that the browser recognized on upload; or, if
// it's a harvest, maybe the remote server has already given us the type
// for this file... with our own type utility we may or may not do better
// than the type supplied:
// -- L.A.
String recognizedType = null;
try {
recognizedType = determineFileType(tempFile.toFile(), fileName);
logger.fine("File utility recognized the file as " + recognizedType);
if (recognizedType != null && !recognizedType.equals("")) {
// is it any better than the type that was supplied to us,
// if any?
// This is not as trivial a task as one might expect...
// We may need a list of "good" mime types, that should always
// be chosen over other choices available. Maybe it should
// even be a weighed list... as in, "application/foo" should
// be chosen over "application/foo-with-bells-and-whistles".
// For now the logic will be as follows:
//
// 1. If the contentType supplied (by the browser, most likely)
// is some form of "unknown", we always discard it in favor of
// whatever our own utilities have determined;
// 2. We should NEVER trust the browser when it comes to the
// following "ingestable" types: Stata, SPSS, R;
// 2a. We are willing to TRUST the browser when it comes to
// the CSV and XSLX ingestable types.
// 3. We should ALWAYS trust our utilities when it comes to
// ingestable types.
if (suppliedContentType == null
|| suppliedContentType.equals("")
|| suppliedContentType.equalsIgnoreCase(MIME_TYPE_UNDETERMINED_DEFAULT)
|| suppliedContentType.equalsIgnoreCase(MIME_TYPE_UNDETERMINED_BINARY)
|| (canIngestAsTabular(suppliedContentType)
&& !suppliedContentType.equalsIgnoreCase(MIME_TYPE_CSV)
&& !suppliedContentType.equalsIgnoreCase(MIME_TYPE_CSV_ALT)
&& !suppliedContentType.equalsIgnoreCase(MIME_TYPE_XLSX))
|| canIngestAsTabular(recognizedType)
|| recognizedType.equals("application/fits-gzipped")
|| recognizedType.equalsIgnoreCase(ShapefileHandler.SHAPEFILE_FILE_TYPE)
|| recognizedType.equals(MIME_TYPE_ZIP)) {
finalType = recognizedType;
}
}
} catch (Exception ex) {
// Type detection is best-effort: any failure here is logged and we fall
// back to the supplied type (or the "undetermined" default) below.
logger.warning("Failed to run the file utility mime type check on file " + fileName);
}
// No (better) type was recognized above: fall back to the type supplied by
// the caller, or to the generic "undetermined" default if none was supplied.
if (finalType == null) {
finalType = (suppliedContentType == null || suppliedContentType.equals(""))
? MIME_TYPE_UNDETERMINED_DEFAULT
: suppliedContentType;
}
// A few special cases:
// if this is a gzipped FITS file, we'll uncompress it, and ingest it as
// a regular FITS file:
if (finalType.equals("application/fits-gzipped")) {
InputStream uncompressedIn = null;
String finalFileName = fileName;
// if the file name had the ".gz" extension, remove it,
// since we are going to uncompress it:
if (fileName != null && fileName.matches(".*\\.gz$")) {
finalFileName = fileName.replaceAll("\\.gz$", "");
}
DataFile datafile = null;
try {
// Uncompress into a fresh temp file, subject to the same per-file
// size limit that applied to the original upload.
uncompressedIn = new GZIPInputStream(new FileInputStream(tempFile.toFile()));
File unZippedTempFile = saveInputStreamInTempFile(uncompressedIn, fileSizeLimit);
datafile = createSingleDataFile(version, unZippedTempFile, finalFileName, MIME_TYPE_UNDETERMINED_DEFAULT, systemConfig.getFileFixityChecksumAlgorithm());
} catch (IOException | FileExceedsMaxSizeException ioex) {
// Non-fatal: leave datafile == null and fall through, so the original
// (still-compressed) file gets saved as-is further down.
datafile = null;
} finally {
if (uncompressedIn != null) {
// Best-effort close; a failure here is deliberately ignored.
try {uncompressedIn.close();} catch (IOException e) {}
}
}
// If we were able to produce an uncompressed file, we'll use it
// to create and return a final DataFile; if not, we're not going
// to do anything - and then a new DataFile will be created further
// down, from the original, uncompressed file.
if (datafile != null) {
// remove the compressed temp file:
try {
tempFile.toFile().delete();
} catch (SecurityException ex) {
// (this is very non-fatal)
logger.warning("Failed to delete temporary file " + tempFile.toString());
}
// Early return: the uncompressed FITS file is the single result.
datafiles.add(datafile);
return datafiles;
}
// If it's a ZIP file, we are going to unpack it and create multiple
// DataFile objects from its contents:
} else if (finalType.equals("application/zip")) {
ZipInputStream unZippedIn = null;
ZipEntry zipEntry = null;
// Maximum number of files we are willing to unpack from one archive
// (a system-wide configuration setting).
int fileNumberLimit = systemConfig.getZipUploadFilesLimit();
try {
Charset charset = null;
/*
TODO: (?)
We may want to investigate somehow letting the user specify
the charset for the filenames in the zip file...
- otherwise, ZipInputStream bails out if it encounteres a file
name that's not valid in the current charest (i.e., UTF-8, in
our case). It would be a bit trickier than what we're doing for
SPSS tabular ingests - with the lang. encoding pulldown menu -
because this encoding needs to be specified *before* we upload and
attempt to unzip the file.
-- L.A. 4.0 beta12
logger.info("default charset is "+Charset.defaultCharset().name());
if (Charset.isSupported("US-ASCII")) {
logger.info("charset US-ASCII is supported.");
charset = Charset.forName("US-ASCII");
if (charset != null) {
logger.info("was able to obtain charset for US-ASCII");
}
}
*/
// charset is always null while the block above stays commented out,
// so the default (UTF-8) ZipInputStream constructor is used.
if (charset != null) {
unZippedIn = new ZipInputStream(new FileInputStream(tempFile.toFile()), charset);
} else {
unZippedIn = new ZipInputStream(new FileInputStream(tempFile.toFile()));
}
// Iterate over the archive entries until getNextEntry() returns null.
while (true) {
try {
zipEntry = unZippedIn.getNextEntry();
} catch (IllegalArgumentException iaex) {
// Note:
// ZipInputStream documentation doesn't even mention that
// getNextEntry() throws an IllegalArgumentException!
// but that's what happens if the file name of the next
// entry is not valid in the current CharSet.
// -- L.A.
warningMessage = "Failed to unpack Zip file. (Unknown Character Set used in a file name?) Saving the file as is.";
logger.warning(warningMessage);
// Re-thrown as a (message-less) IOException purely as control flow:
// the outer catch below rolls back to saving the zip file as-is.
throw new IOException();
}
if (zipEntry == null) {
break;
}
// Note that some zip entries may be directories - we
// simply skip them:
if (!zipEntry.isDirectory()) {
// NOTE(review): because '>' is used (not '>='), up to
// fileNumberLimit + 1 files can be added before this aborts -
// confirm whether that off-by-one is intended.
if (datafiles.size() > fileNumberLimit) {
logger.warning("Zip upload - too many files.");
warningMessage = "The number of files in the zip archive is over the limit (" + fileNumberLimit +
"); please upload a zip archive with fewer files, if you want them to be ingested " +
"as individual DataFiles.";
// Same control-flow trick: abort unzipping via the outer catch.
throw new IOException();
}
String fileEntryName = zipEntry.getName();
logger.fine("ZipEntry, file: " + fileEntryName);
if (fileEntryName != null && !fileEntryName.equals("")) {
// Strip any directory path, keeping just the base file name.
String shortName = fileEntryName.replaceFirst("^.*[\\/]", "");
// Check if it's a "fake" file - a zip archive entry
// created for a MacOS X filesystem element: (these
// start with "._")
if (!shortName.startsWith("._") && !shortName.startsWith(".DS_Store") && !"".equals(shortName)) {
// OK, this seems like an OK file entry - we'll try
// to read it and create a DataFile with it:
// (reading from unZippedIn consumes just the current entry)
File unZippedTempFile = saveInputStreamInTempFile(unZippedIn, fileSizeLimit);
DataFile datafile = createSingleDataFile(version, unZippedTempFile, null, shortName,
MIME_TYPE_UNDETERMINED_DEFAULT,
systemConfig.getFileFixityChecksumAlgorithm(), null, false);
if (!fileEntryName.equals(shortName)) {
// If the filename looks like a hierarchical folder name (i.e., contains slashes and backslashes),
// we'll extract the directory name; then subject it to some "aggressive sanitizing" - strip all
// the leading, trailing and duplicate slashes; then replace all the characters that
// don't pass our validation rules.
String directoryName = fileEntryName.replaceFirst("[\\\\/][\\\\/]*[^\\\\/]*$", "");
directoryName = StringUtil.sanitizeFileDirectory(directoryName, true);
// if (!"".equals(directoryName)) {
if (!StringUtil.isEmpty(directoryName)) {
logger.fine("setting the directory label to " + directoryName);
datafile.getFileMetadata().setDirectoryLabel(directoryName);
}
}
// NOTE(review): datafile was just assigned (and dereferenced)
// above; this null check looks purely defensive - confirm whether
// createSingleDataFile() can actually return null.
if (datafile != null) {
// We have created this datafile with the mime type "unknown";
// Now that we have it saved in a temporary location,
// let's try and determine its real type:
// (presumably the file was staged in the temp directory under
// its storage identifier - verify against saveInputStreamInTempFile/
// createSingleDataFile)
String tempFileName = getFilesTempDirectory() + "/" + datafile.getStorageIdentifier();
try {
recognizedType = determineFileType(new File(tempFileName), shortName);
logger.fine("File utility recognized unzipped file as " + recognizedType);
if (recognizedType != null && !recognizedType.equals("")) {
datafile.setContentType(recognizedType);
}
} catch (Exception ex) {
// Best-effort: keep the "undetermined" type on failure.
logger.warning("Failed to run the file utility mime type check on file " + fileName);
}
datafiles.add(datafile);
}
}
}
}
// Close the current entry before advancing to the next one.
unZippedIn.closeEntry();
}
} catch (IOException ioex) {
// just clear the datafiles list and let
// ingest default to creating a single DataFile out
// of the unzipped file.
logger.warning("Unzipping failed; rolling back to saving the file as is.");
if (warningMessage == null) {
warningMessage = "Failed to unzip the file. Saving the file as is.";
}
datafiles.clear();
} catch (FileExceedsMaxSizeException femsx) {
// One unpacked entry was too large: same rollback as above, with a
// more specific warning for the user.
logger.warning("One of the unzipped files exceeds the size limit; resorting to saving the file as is. " + femsx.getMessage());
warningMessage = femsx.getMessage() + "; saving the zip file as is, unzipped.";
datafiles.clear();
} finally {
if (unZippedIn != null) {
// Best-effort close of the archive stream.
try {unZippedIn.close();} catch (Exception zEx) {}
}
}
// (Continues past this excerpt.)
if (datafiles.size() > 0) {
// link the data files to the dataset/version: