@@ -146,32 +146,102 @@ def parseJsonFile = { File file, JsonSlurper slurper, int maxDocs ->
146146 }
147147 }
148148
149+ if (firstChar == null ) {
150+ throw new RuntimeException (" JSON file is empty" )
151+ }
152+
149153 if (firstChar == (char ) ' [' ) {
150- // JSON array format
151154 println " Detected: JSON array format"
152- def parsed = slurper. parse(file)
153- if (parsed instanceof List ) {
154- documents = ((List ) parsed). take(maxDocs)
155- } else {
156- documents = [parsed]
157- }
158155 } else if (firstChar == (char ) ' {' ) {
159- // Newline-delimited JSON (NDJSON) format
160- println " Detected: Newline-delimited JSON (NDJSON) format"
161- file. eachLine(' UTF-8' ) { line ->
162- if (documents. size() < maxDocs) {
163- String trimmed = line. trim()
164- if (trimmed && trimmed. startsWith(' {' )) {
165- try {
166- documents. add(slurper. parseText(trimmed))
167- } catch (Exception e) {
168- println " Warning: Failed to parse line: ${ e.message.take(50)} "
156+ println " Detected: JSON object(s) format"
157+ } else {
158+ throw new RuntimeException (
159+ " Unrecognized JSON format. File should start with '[' or '{'" )
160+ }
161+
162+ /**
163+ * Extract complete JSON objects by tracking brace depth.
164+ * Handles:
165+ * - Strings with escaped quotes, braces, and newlines
166+ * - Nested objects and arrays
167+ * - Pretty-printed / multiline JSON
168+ * - NDJSON
169+ * - JSON arrays of objects
170+ */
171+ file. withReader(' UTF-8' ) { reader ->
172+ StringBuilder current = new StringBuilder ()
173+ int depth = 0
174+ boolean inString = false
175+ boolean escaped = false
176+ boolean foundFirstBrace = false
177+
178+ int ch
179+ while ((ch = reader. read()) != -1 && documents. size() < maxDocs) {
180+ char c = (char ) ch
181+
182+ // Handle string escaping
183+ if (escaped) {
184+ current. append(c)
185+ escaped = false
186+ continue
187+ }
188+
189+ if (c == (char ) ' \\ ' && inString) {
190+ current. append(c)
191+ escaped = true
192+ continue
193+ }
194+
195+ if (c == (char ) ' "' ) {
196+ inString = ! inString
197+ if (foundFirstBrace) {
198+ current. append(c)
199+ }
200+ continue
201+ }
202+
203+ // Inside a string literal: copy the character verbatim so braces within strings do not affect depth
204+ if (inString) {
205+ current. append(c)
206+ continue
207+ }
208+
209+ // We're outside a string
210+ if (c == (char ) ' {' ) {
211+ if (depth == 0 ) {
212+ // Starting a new top-level object
213+ foundFirstBrace = true
214+ current. setLength(0 )
215+ }
216+ current. append(c)
217+ depth++
218+ } else if (c == (char ) ' }' ) {
219+ depth--
220+ current. append(c)
221+
222+ if (depth == 0 && foundFirstBrace) {
223+ // Completed a top-level object
224+ String jsonStr = current. toString(). trim()
225+ if (jsonStr) {
226+ try {
227+ documents. add(slurper. parseText(jsonStr))
228+ } catch (Exception e) {
229+ println " Warning: Failed to parse object: ${ e.message.take(80)} "
230+ }
169231 }
232+ current. setLength(0 )
233+ foundFirstBrace = false
170234 }
235+ } else if (foundFirstBrace) {
236+ current. append(c)
171237 }
238+ // Characters outside an object (commas between array elements,
239+ // brackets, whitespace) are silently skipped
172240 }
173- } else {
174- throw new RuntimeException (" Unrecognized JSON format. File should start with '[' (array) or '{' (NDJSON)" )
241+ }
242+
243+ if (documents. isEmpty()) {
244+ throw new RuntimeException (" No valid JSON objects found in file" )
175245 }
176246
177247 println " Parsed ${ documents.size()} document(s)"
@@ -259,7 +329,7 @@ def toFieldName = { String name, String idFldName ->
259329 def first = parts[0].toLowerCase()
260330 def rest = parts.drop(1).collect { it.capitalize() }.join('')
261331 return first + rest*/
262- return name;
332+ return name
263333}
264334
265335/**
0 commit comments