Skip to content

Commit 5e3ce84

Browse files
fix column lineage when multiple jobs write to same dataset
Signed-off-by: Pawel Leszczynski <leszczynski.pawel@gmail.com>
1 parent: 0995b0a; commit: 5e3ce84

13 files changed

Lines changed: 114 additions & 68 deletions

File tree

api/src/main/java/marquez/db/ColumnLineageDao.java

Lines changed: 20 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -146,12 +146,15 @@ WHERE output_dataset_field_uuid IN (<datasetFieldUuids>)
146146
output_fields.dataset_name,
147147
output_fields.field_name,
148148
output_fields.type,
149-
ARRAY_AGG(DISTINCT ARRAY[input_fields.namespace_name, input_fields.dataset_name, CAST(clr.input_dataset_version_uuid AS VARCHAR), input_fields.field_name]) AS inputFields,
150-
clr.output_dataset_version_uuid as dataset_version_uuid,
151-
clr.transformation_description,
152-
clr.transformation_type,
153-
clr.created_at,
154-
clr.updated_at
149+
ARRAY_AGG(DISTINCT ARRAY[
150+
input_fields.namespace_name,
151+
input_fields.dataset_name,
152+
CAST(clr.input_dataset_version_uuid AS VARCHAR),
153+
input_fields.field_name,
154+
clr.transformation_description,
155+
clr.transformation_type
156+
]) AS inputFields,
157+
clr.output_dataset_version_uuid as dataset_version_uuid
155158
FROM column_lineage_recursive clr
156159
INNER JOIN dataset_fields_view output_fields ON clr.output_dataset_field_uuid = output_fields.uuid -- hidden datasets will be filtered
157160
LEFT JOIN dataset_fields_view input_fields ON clr.input_dataset_field_uuid = input_fields.uuid
@@ -161,11 +164,7 @@ WHERE output_dataset_field_uuid IN (<datasetFieldUuids>)
161164
output_fields.dataset_name,
162165
output_fields.field_name,
163166
output_fields.type,
164-
clr.output_dataset_version_uuid,
165-
clr.transformation_description,
166-
clr.transformation_type,
167-
clr.created_at,
168-
clr.updated_at
167+
clr.output_dataset_version_uuid
169168
""")
170169
Set<ColumnLineageNodeData> getLineage(
171170
int depth,
@@ -193,25 +192,23 @@ dataset_fields_view AS (
193192
output_fields.dataset_name,
194193
output_fields.field_name,
195194
output_fields.type,
196-
ARRAY_AGG(DISTINCT ARRAY[input_fields.namespace_name, input_fields.dataset_name, CAST(c.input_dataset_version_uuid AS VARCHAR), input_fields.field_name]) AS inputFields,
197-
c.output_dataset_version_uuid as dataset_version_uuid,
198-
c.transformation_description,
199-
c.transformation_type,
200-
c.created_at,
201-
c.updated_at
195+
ARRAY_AGG(DISTINCT ARRAY[
196+
input_fields.namespace_name,
197+
input_fields.dataset_name,
198+
CAST(c.input_dataset_version_uuid AS VARCHAR),
199+
input_fields.field_name,
200+
c.transformation_description,
201+
c.transformation_type
202+
]) AS inputFields,
203+
null as dataset_version_uuid
202204
FROM selected_column_lineage c
203205
INNER JOIN dataset_fields_view output_fields ON c.output_dataset_field_uuid = output_fields.uuid
204206
LEFT JOIN dataset_fields_view input_fields ON c.input_dataset_field_uuid = input_fields.uuid
205207
GROUP BY
206208
output_fields.namespace_name,
207209
output_fields.dataset_name,
208210
output_fields.field_name,
209-
output_fields.type,
210-
c.output_dataset_version_uuid,
211-
c.transformation_description,
212-
c.transformation_type,
213-
c.created_at,
214-
c.updated_at
211+
output_fields.type
215212
""")
216213
/**
217214
* Each dataset is identified by a pair of strings (namespace and name). A query returns column

api/src/main/java/marquez/db/mappers/ColumnLineageNodeDataMapper.java

Lines changed: 6 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -5,11 +5,9 @@
55

66
package marquez.db.mappers;
77

8-
import static marquez.db.Columns.TRANSFORMATION_DESCRIPTION;
9-
import static marquez.db.Columns.TRANSFORMATION_TYPE;
108
import static marquez.db.Columns.stringOrNull;
119
import static marquez.db.Columns.stringOrThrow;
12-
import static marquez.db.Columns.uuidOrThrow;
10+
import static marquez.db.Columns.uuidOrNull;
1311

1412
import com.fasterxml.jackson.databind.ObjectMapper;
1513
import com.google.common.collect.ImmutableList;
@@ -37,11 +35,9 @@ public ColumnLineageNodeData map(ResultSet results, StatementContext ctx) throws
3735
return new ColumnLineageNodeData(
3836
stringOrThrow(results, Columns.NAMESPACE_NAME),
3937
stringOrThrow(results, Columns.DATASET_NAME),
40-
uuidOrThrow(results, Columns.DATASET_VERSION_UUID),
38+
uuidOrNull(results, Columns.DATASET_VERSION_UUID),
4139
stringOrThrow(results, Columns.FIELD_NAME),
4240
stringOrNull(results, Columns.TYPE),
43-
stringOrNull(results, TRANSFORMATION_DESCRIPTION),
44-
stringOrNull(results, TRANSFORMATION_TYPE),
4541
toInputFields(results, "inputFields"));
4642
}
4743

@@ -57,7 +53,10 @@ public static ImmutableList<InputFieldNodeData> toInputFields(ResultSet results,
5753
return ImmutableList.copyOf(
5854
Arrays.asList(deserializedArray).stream()
5955
.map(o -> (String[]) o)
60-
.map(arr -> new InputFieldNodeData(arr[0], arr[1], UUID.fromString(arr[2]), arr[3]))
56+
.map(
57+
arr ->
58+
new InputFieldNodeData(
59+
arr[0], arr[1], UUID.fromString(arr[2]), arr[3], arr[4], arr[5]))
6160
.collect(Collectors.toList()));
6261
}
6362
}

api/src/main/java/marquez/db/models/ColumnLineageNodeData.java

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,5 @@ public class ColumnLineageNodeData implements NodeData {
2020
@Nullable UUID datasetVersion;
2121
@NonNull String field;
2222
@Nullable String fieldType;
23-
String transformationDescription;
24-
String transformationType;
2523
@NonNull List<InputFieldNodeData> inputFields;
2624
}

api/src/main/java/marquez/db/models/InputFieldNodeData.java

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -20,4 +20,6 @@ public class InputFieldNodeData {
2020
@NonNull String dataset;
2121
@Nullable UUID datasetVersion;
2222
@NonNull String field;
23+
String transformationDescription;
24+
String transformationType;
2325
}

api/src/main/java/marquez/service/ColumnLineageService.java

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -226,14 +226,16 @@ public void enrichWithColumnLineage(List<Dataset> datasets) {
226226
.add(
227227
ColumnLineage.builder()
228228
.name(nodeData.getField())
229-
.transformationDescription(nodeData.getTransformationDescription())
230-
.transformationType(nodeData.getTransformationType())
231229
.inputFields(
232230
nodeData.getInputFields().stream()
233231
.map(
234232
f ->
235233
new ColumnLineageInputField(
236-
f.getNamespace(), f.getDataset(), f.getField()))
234+
f.getNamespace(),
235+
f.getDataset(),
236+
f.getField(),
237+
f.getTransformationDescription(),
238+
f.getTransformationType()))
237239
.collect(Collectors.toList()))
238240
.build());
239241
});

api/src/main/java/marquez/service/models/ColumnLineage.java

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,4 @@
1919
public class ColumnLineage {
2020
@NotNull private String name;
2121
@NotNull private List<ColumnLineageInputField> inputFields;
22-
@NotNull private String transformationDescription;
23-
@NotNull private String transformationType;
2422
}

api/src/main/java/marquez/service/models/ColumnLineageInputField.java

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -19,4 +19,6 @@ public class ColumnLineageInputField {
1919
@NotNull private String namespace;
2020
@NotNull private String dataset;
2121
@NotNull private String field;
22+
@NotNull private String transformationDescription;
23+
@NotNull private String transformationType;
2224
}

api/src/test/java/marquez/db/ColumnLineageDaoTest.java

Lines changed: 46 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,7 @@
2828
import marquez.db.models.ColumnLineageRow;
2929
import marquez.db.models.DatasetRow;
3030
import marquez.db.models.DatasetVersionRow;
31+
import marquez.db.models.InputFieldNodeData;
3132
import marquez.db.models.NamespaceRow;
3233
import marquez.db.models.SourceRow;
3334
import marquez.db.models.UpdateLineageRow;
@@ -250,8 +251,8 @@ void testGetLineage() {
250251
assertEquals("namespace", dataset_c.getInputFields().get(0).getNamespace());
251252
assertEquals("dataset_b", dataset_c.getInputFields().get(0).getDataset());
252253
assertEquals("col_c", dataset_c.getInputFields().get(0).getField());
253-
assertEquals("type2", dataset_c.getTransformationType());
254-
assertEquals("description2", dataset_c.getTransformationDescription());
254+
assertEquals("type2", dataset_c.getInputFields().get(0).getTransformationType());
255+
assertEquals("description2", dataset_c.getInputFields().get(0).getTransformationDescription());
255256

256257
// test dataset_b
257258
assertThat(dataset_b.getInputFields()).hasSize(2);
@@ -273,8 +274,8 @@ void testGetLineage() {
273274

274275
assertEquals("namespace", dataset_b.getInputFields().get(0).getNamespace());
275276
assertEquals("dataset_a", dataset_b.getInputFields().get(0).getDataset());
276-
assertEquals("type1", dataset_b.getTransformationType());
277-
assertEquals("description1", dataset_b.getTransformationDescription());
277+
assertEquals("type1", dataset_b.getInputFields().get(0).getTransformationType());
278+
assertEquals("description1", dataset_b.getInputFields().get(0).getTransformationDescription());
278279
}
279280

280281
@Test
@@ -483,6 +484,47 @@ void testGetLineageWhenDataTypeIsEmpty() {
483484
getColumnLineage(lineageRow, "col_c");
484485
}
485486

487+
@Test
488+
void testGetLineageRowsForDatasetsWhenMultipleJobsWriteToADataset() {
489+
List<LineageEvent.ColumnLineageInputField> fields =
490+
getDatasetB()
491+
.getFacets()
492+
.getColumnLineage()
493+
.getFields()
494+
.getAdditionalFacets()
495+
.get("col_c")
496+
.getInputFields();
497+
498+
Dataset datasetWithColAAsInputField = getDatasetB();
499+
datasetWithColAAsInputField
500+
.getFacets()
501+
.getColumnLineage()
502+
.getFields()
503+
.getAdditionalFacets()
504+
.get("col_c")
505+
.setInputFields(Collections.singletonList(fields.get(0)));
506+
createLineage(openLineageDao, getDatasetA(), datasetWithColAAsInputField);
507+
508+
Dataset datasetWithColBAsInputField = getDatasetB();
509+
datasetWithColBAsInputField
510+
.getFacets()
511+
.getColumnLineage()
512+
.getFields()
513+
.getAdditionalFacets()
514+
.get("col_c")
515+
.setInputFields(Collections.singletonList(fields.get(1)));
516+
createLineage(openLineageDao, getDatasetA(), datasetWithColBAsInputField);
517+
518+
List<InputFieldNodeData> inputFields =
519+
dao
520+
.getLineageRowsForDatasets(Collections.singletonList(Pair.of("namespace", "dataset_b")))
521+
.stream()
522+
.findAny()
523+
.get()
524+
.getInputFields();
525+
assertThat(inputFields).hasSize(2); // should contain col_a and col_b
526+
}
527+
486528
private Set<ColumnLineageNodeData> getColumnLineage(UpdateLineageRow lineageRow, String field) {
487529
UpdateLineageRow.DatasetRecord datasetRecord = lineageRow.getOutputs().get().get(0);
488530
UUID field_UUID = fieldDao.findUuid(datasetRecord.getDatasetRow().getUuid(), field).get();

api/src/test/java/marquez/service/ColumnLineageServiceTest.java

Lines changed: 10 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -93,9 +93,8 @@ public void testLineageByDatasetFieldId() {
9393
Node col_c = getNode(lineage, "dataset_b", "col_c").get();
9494
List<InputFieldNodeData> inputFields =
9595
((ColumnLineageNodeData) col_c.getData()).getInputFields();
96-
assertEquals(
97-
"description1", ((ColumnLineageNodeData) col_c.getData()).getTransformationDescription());
98-
assertEquals("type1", ((ColumnLineageNodeData) col_c.getData()).getTransformationType());
96+
assertEquals("description1", inputFields.get(0).getTransformationDescription());
97+
assertEquals("type1", inputFields.get(0).getTransformationType());
9998
assertEquals("STRING", ((ColumnLineageNodeData) col_c.getData()).getFieldType());
10099
assertThat(inputFields).hasSize(2);
101100
assertEquals("dataset_a", inputFields.get(0).getDataset());
@@ -195,28 +194,27 @@ public void testEnrichDatasets() {
195194

196195
assertThat(dataset_b.getColumnLineage()).hasSize(1);
197196
assertThat(dataset_b.getColumnLineage().get(0).getName()).isEqualTo("col_c");
198-
assertThat(dataset_b.getColumnLineage().get(0).getTransformationType()).isEqualTo("type1");
199-
assertThat(dataset_b.getColumnLineage().get(0).getTransformationDescription())
200-
.isEqualTo("description1");
201197

202198
List<ColumnLineageInputField> inputFields_b =
203199
dataset_b.getColumnLineage().get(0).getInputFields();
204200
assertThat(inputFields_b)
205201
.hasSize(2)
206-
.contains(new ColumnLineageInputField("namespace", "dataset_a", "col_a"))
207-
.contains(new ColumnLineageInputField("namespace", "dataset_a", "col_b"));
202+
.contains(
203+
new ColumnLineageInputField("namespace", "dataset_a", "col_a", "description1", "type1"))
204+
.contains(
205+
new ColumnLineageInputField(
206+
"namespace", "dataset_a", "col_b", "description1", "type1"));
208207

209208
assertThat(dataset_c.getColumnLineage()).hasSize(1);
210209
assertThat(dataset_c.getColumnLineage().get(0).getName()).isEqualTo("col_d");
211-
assertThat(dataset_c.getColumnLineage().get(0).getTransformationType()).isEqualTo("type2");
212-
assertThat(dataset_c.getColumnLineage().get(0).getTransformationDescription())
213-
.isEqualTo("description2");
214210

215211
List<ColumnLineageInputField> inputFields_c =
216212
dataset_c.getColumnLineage().get(0).getInputFields();
217213
assertThat(inputFields_c)
218214
.hasSize(1)
219-
.contains(new ColumnLineageInputField("namespace", "dataset_b", "col_c"));
215+
.contains(
216+
new ColumnLineageInputField(
217+
"namespace", "dataset_b", "col_c", "description2", "type2"));
220218
}
221219

222220
@Test

clients/java/src/main/java/marquez/client/models/ColumnLineage.java

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,5 @@
1818
@Getter
1919
public class ColumnLineage {
2020
@NonNull private String name;
21-
@NonNull private List<DatasetFieldId> inputFields;
22-
@NonNull private String transformationDescription;
23-
@NonNull private String transformationType;
21+
@NonNull private List<ColumnLineageInputField> inputFields;
2422
}

0 commit comments

Comments
 (0)