Skip to content

Commit 08d0296

Browse files
authored
fix: Zip CSV uploads not working (#1457)
Fixes #1080. Fixes #1416. Updates jszip and uses its internal stream helper instead of nodeStream, which would require a polyfill. The progress on zip uploads is a bit odd since we use the unzip progress. It seems papaparse reads ahead from the stream while (or before) processing, so it can finish reading the zip before it is done processing chunks. Also, uploading the tables seems to be a blocking operation. Not sure if that's an easy fix, but after the 50% mark on uploading a large zip, the main thread is blocked at times (the marching ants animation freezing is an indicator). JSZip also appears to have a ~2GB limit; see Stuk/jszip#777.
1 parent 6ff27a6 commit 08d0296

7 files changed

Lines changed: 103 additions & 65 deletions

File tree

package-lock.json

Lines changed: 18 additions & 31 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,6 @@
8282
"@types/eslint": "^8.4.10",
8383
"@types/jest": "^29.2.5",
8484
"@types/jquery": "^3.5.14",
85-
"@types/jszip": "3.1.7",
8685
"@types/lodash": "^4.14.182",
8786
"@types/lodash.clamp": "^4.0.6",
8887
"@types/lodash.debounce": "^4.0.6",

packages/code-studio/package.json

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@
3939
"@fortawesome/react-fontawesome": "^0.2.0",
4040
"classnames": "^2.3.1",
4141
"event-target-shim": "^6.0.2",
42-
"jszip": "3.2.2",
42+
"jszip": "3.10.1",
4343
"lodash.debounce": "^4.0.8",
4444
"lodash.throttle": "^4.1.1",
4545
"memoize-one": "^5.1.1",
@@ -58,10 +58,6 @@
5858
"redux-thunk": "^2.4.1",
5959
"shortid": "^2.2.16"
6060
},
61-
"dependenciesComments": {
62-
"@types/jszip": "3.1.7 is the closest to 3.2.2 available. JSZip adds official typings in 3.4.0, so remove if going past JSZip 3.4.0",
63-
"jszip": "Pinned to 3.2.2 b/c 3.3.0+ breaks nodestream usage. Not fixed as of 3.5.0. https://github.com/Stuk/jszip/issues/663"
64-
},
6561
"homepage": ".",
6662
"main": "build/index.html",
6763
"files": [

packages/console/src/csv/CsvInputBar.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,7 @@ class CsvInputBar extends Component<CsvInputBarProps, CsvInputBarState> {
178178
}
179179
}
180180

181-
handleFile(file: Blob | JSZipObject, isZip = false): void {
181+
handleFile(file: File | Blob | JSZipObject, isZip = false): void {
182182
log.info(
183183
`Starting CSV parser for ${
184184
file instanceof File ? file.name : 'pasted values'

packages/console/src/csv/CsvParser.ts

Lines changed: 28 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import type { IdeSession, Table } from '@deephaven/jsapi-types';
55
import type { JSZipObject } from 'jszip';
66
import CsvTypeParser from './CsvTypeParser';
77
import { CsvTypes } from './CsvFormats';
8+
import makeZipStreamHelper from './ZipStreamHelper';
89

910
const log = Log.module('CsvParser');
1011

@@ -15,7 +16,7 @@ const ZIP_CONSOLIDATE_CHUNKS = 650;
1516
interface CsvParserConstructor {
1617
onFileCompleted: (tables: Table[]) => void;
1718
session: IdeSession;
18-
file: Blob | JSZipObject;
19+
file: File | Blob | JSZipObject;
1920
type: CsvTypes;
2021
readHeaders: boolean;
2122
onProgress: (progressValue: number) => boolean;
@@ -70,6 +71,8 @@ class CsvParser {
7071
this.onProgress = onProgress;
7172
this.onError = onError;
7273
this.tables = [];
74+
this.rowCount = 0;
75+
this.rowsProcessed = 0;
7376
this.chunks = 0;
7477
this.totalChunks = isZip
7578
? 0
@@ -102,7 +105,7 @@ class CsvParser {
102105

103106
session: IdeSession;
104107

105-
file: Blob | JSZipObject;
108+
file: File | Blob | JSZipObject;
106109

107110
isZip: boolean;
108111

@@ -122,6 +125,10 @@ class CsvParser {
122125

123126
types?: string[];
124127

128+
rowCount: number;
129+
130+
rowsProcessed: number;
131+
125132
chunks: number;
126133

127134
totalChunks: number;
@@ -167,17 +174,20 @@ class CsvParser {
167174
}
168175

169176
parse(): void {
170-
const handleParseDone = (types: string[]) => {
171-
const toParse = this.isZip
172-
? (this.file as JSZipObject).nodeStream(
173-
// JsZip types are incorrect, thus the funny casting
174-
// Actual parameter is 'nodebuffer'
175-
'nodebuffer' as 'nodestream',
176-
this.handleNodeUpdate
177-
)
178-
: (this.file as Blob);
177+
const handleParseDone = (types: string[], rowCount: number) => {
179178
this.types = types;
180-
Papa.parse(toParse, this.config);
179+
this.rowCount = rowCount;
180+
181+
if (this.file instanceof File || this.file instanceof Blob) {
182+
Papa.parse(this.file, this.config);
183+
} else {
184+
const zipStream = makeZipStreamHelper(this.file, this.handleNodeUpdate);
185+
// This is actually a stream, but papaparse TS doesn't like it
186+
Papa.parse(zipStream as unknown as Blob, this.config);
187+
// The stream needs to be manually resumed since jszip starts paused
188+
// Papaparse does not call resume and assumes the stream is already reading
189+
zipStream.resume();
190+
}
181191
};
182192
const typeParser = new CsvTypeParser(
183193
handleParseDone,
@@ -274,6 +284,9 @@ class CsvParser {
274284
}
275285
assertNotNull(this.headers);
276286
assertNotNull(types);
287+
288+
this.rowsProcessed += columns[0].length;
289+
277290
session
278291
.newTable(this.headers, types, columns, this.timeZone)
279292
.then(table => {
@@ -294,7 +307,9 @@ class CsvParser {
294307
if (totalChunks > 0) {
295308
progress = Math.round((tables.length / totalChunks) * 50) + 50;
296309
} else {
297-
progress = Math.round(50 + this.zipProgress / 2);
310+
// The zip file can be read entirely while in the middle of parsing
311+
// Since we know the number of rows from the type parsing, use that for progress
312+
progress = Math.round((this.rowsProcessed / this.rowCount) * 50) + 50;
298313
}
299314
log.debug2(`CSV parser progress ${progress}`);
300315
onProgress(progress);

packages/console/src/csv/CsvTypeParser.ts

Lines changed: 22 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import Papa, { Parser, ParseResult, ParseLocalConfig } from 'papaparse';
44
// Intentionally using isNaN rather than Number.isNaN
55
/* eslint-disable no-restricted-globals */
66
import NewTableColumnTypes from './NewTableColumnTypes';
7+
import makeZipStreamHelper from './ZipStreamHelper';
78

89
// Initially column types start as unknown
910
const UNKNOWN = 'unknown';
@@ -156,8 +157,8 @@ class CsvTypeParser {
156157
}
157158

158159
constructor(
159-
onFileCompleted: (types: string[]) => void,
160-
file: Blob | JSZipObject,
160+
onFileCompleted: (types: string[], rowCount: number) => void,
161+
file: File | Blob | JSZipObject,
161162
readHeaders: boolean,
162163
parentConfig: ParseLocalConfig<unknown, Blob | NodeJS.ReadableStream>,
163164
nullString: string | null,
@@ -175,6 +176,7 @@ class CsvTypeParser {
175176
this.onError = onError;
176177
this.chunks = 0;
177178
this.totalChunks = totalChunks;
179+
this.rowCount = 0;
178180
this.isZip = isZip;
179181
this.shouldTrim = shouldTrim;
180182
this.zipProgress = 0;
@@ -192,9 +194,9 @@ class CsvTypeParser {
192194
};
193195
}
194196

195-
onFileCompleted: (types: string[]) => void;
197+
onFileCompleted: (types: string[], rowCount: number) => void;
196198

197-
file: Blob | JSZipObject;
199+
file: File | Blob | JSZipObject;
198200

199201
readHeaders: boolean;
200202

@@ -210,6 +212,8 @@ class CsvTypeParser {
210212

211213
totalChunks: number;
212214

215+
rowCount: number;
216+
213217
isZip: boolean;
214218

215219
shouldTrim: boolean;
@@ -219,15 +223,16 @@ class CsvTypeParser {
219223
config: ParseLocalConfig<unknown, Blob | NodeJS.ReadableStream>;
220224

221225
parse(): void {
222-
const toParse = this.isZip
223-
? (this.file as JSZipObject).nodeStream(
224-
// JsZip types are incorrect, thus the funny casting
225-
// Actual parameter is 'nodebuffer'
226-
'nodebuffer' as 'nodestream',
227-
this.handleNodeUpdate
228-
)
229-
: (this.file as Blob);
230-
Papa.parse(toParse, this.config);
226+
if (this.file instanceof File || this.file instanceof Blob) {
227+
Papa.parse(this.file, this.config);
228+
} else {
229+
const zipStream = makeZipStreamHelper(this.file, this.handleNodeUpdate);
230+
// This is actually a stream, but papaparse TS doesn't like it
231+
Papa.parse(zipStream as unknown as Blob, this.config);
232+
// The stream needs to be manually resumed since jszip starts paused
233+
// Papaparse does not call resume and assumes the stream is already reading
234+
zipStream.resume();
235+
}
231236
}
232237

233238
handleChunk(result: ParseResult<string[]>, parser: Parser): void {
@@ -245,6 +250,8 @@ class CsvTypeParser {
245250
}
246251
}
247252

253+
this.rowCount += data.length;
254+
248255
assertNotNull(this.types);
249256

250257
const cloneTypes = [...this.types];
@@ -294,7 +301,8 @@ class CsvTypeParser {
294301
type === UNKNOWN || type === NewTableColumnTypes.LOCAL_TIME
295302
? NewTableColumnTypes.STRING
296303
: type
297-
)
304+
),
305+
this.rowCount
298306
);
299307
}
300308
}
packages/console/src/csv/ZipStreamHelper.ts

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
import type { JSZipObject, OnUpdateCallback, JSZipStreamHelper } from 'jszip';
2+
3+
/**
4+
* This is used to help papaparse understand our stream.
5+
* It uses these fields for feature detection, but never actually calls read()
6+
* https://github.com/mholt/PapaParse/blob/master/papaparse.js#L244
7+
*/
8+
interface ZipStreamHelper extends JSZipStreamHelper<string> {
9+
readable: boolean;
10+
read(): void;
11+
removeListener(): void;
12+
}
13+
14+
export default function makeZipStreamHelper(
15+
zipObj: JSZipObject,
16+
onUpdate: OnUpdateCallback
17+
) {
18+
const helper: ZipStreamHelper = (
19+
zipObj as JSZipObject & {
20+
// The type could be anything except nodebuffer from https://stuk.github.io/jszip/documentation/api_zipobject/internal_stream.html
21+
// We only need it as a string though
22+
// JSZip types don't include this method for some reason
23+
internalStream(type: 'string'): JSZipStreamHelper<string>;
24+
}
25+
).internalStream('string') as ZipStreamHelper;
26+
27+
helper.readable = true;
28+
helper.read = () => false;
29+
helper.removeListener = () => false;
30+
helper.on('data', (_, metadata) => onUpdate(metadata));
31+
32+
return helper;
33+
}

0 commit comments

Comments
 (0)