Skip to content

Commit 135553e

Browse files
authored
Merge branch 'main' into fix/runstatecolor
2 parents 27ad949 + cf0ba3e commit 135553e

31 files changed

Lines changed: 834 additions & 140 deletions

CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,15 @@
33
## [Unreleased](https://github.com/MarquezProject/marquez/compare/0.32.0...HEAD)
44

55
### Fixed
6+
67
* UI: better handling of null job latestRun for Jobs page [#2467](https://github.com/MarquezProject/marquez/pull/2467) [@perttus](https://github.com/perttus)
8+
9+
### Added
10+
11+
* Support `inputFacets` and `outputFacets` from OpenLineage specification [`#2417`](https://github.com/MarquezProject/marquez/pull/2417) [@pawel-big-lebowski](https://github.com/pawel-big-lebowski)
12+
*Adds the ability to store `inputFacets` / `outputFacets` which are sent within datasets.*
13+
*Exposes them through the Marquez API as a member of the `Run` resource.*
14+
715
## [0.32.0](https://github.com/MarquezProject/marquez/compare/0.31.0...0.32.0) - 2023-03-20
816

917
### Fixed

COMMITTERS.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ They take responsibility for guiding new pull requests into the main branch.
2424
| Michael Robinson | [@merobi-hub](https://github.com/merobi-hub) |
2525
| Ross Turk | [@rossturk](https://github.com/rossturk) |
2626
| Minkyu Park | [@fm100](https://github.com/fm100) |
27-
| Pawel Leszczynski | [@pawel-big-lebowski](https://github.com/pawel-big-lebowski) |
27+
| Paweł Leszczyński | [@pawel-big-lebowski](https://github.com/pawel-big-lebowski) |
2828
2929
## Emeritus
3030

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
/*
2+
* Copyright 2018-2023 contributors to the Marquez project
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package marquez.common.models;
7+
8+
import com.fasterxml.jackson.annotation.JsonProperty;
9+
import com.google.common.collect.ImmutableMap;
10+
import lombok.EqualsAndHashCode;
11+
import lombok.Getter;
12+
import lombok.NonNull;
13+
import lombok.ToString;
14+
15+
/**
16+
* Class used to store dataset version and `inputFacets` which are assigned to datasets within
17+
* OpenLineage spec, but are exposed within Marquez api as a part of {@link
18+
* marquez.service.models.Run}
19+
*/
20+
@EqualsAndHashCode
21+
@ToString
22+
@Getter
23+
public class InputDatasetVersion {
24+
25+
// Identifier of the dataset version; the constructor's @NonNull check rejects null.
private final DatasetVersionId datasetVersionId;
26+
// Facets held as an immutable map; presumably keyed by facet name -- TODO confirm.
private final ImmutableMap<String, Object> facets;
27+
28+
/**
 * Creates an instance from its two components. The {@code @JsonProperty} annotations bind the
 * arguments to the {@code datasetVersionId} and {@code facets} JSON properties.
 *
 * @param datasetVersionId identifier of the dataset version; must not be null
 * @param facets facets to store; must not be null
 * @throws NullPointerException if either argument is null (default behavior of Lombok's
 *     {@code @NonNull})
 */
public InputDatasetVersion(
29+
@JsonProperty("datasetVersionId") @NonNull DatasetVersionId datasetVersionId,
30+
@JsonProperty("facets") @NonNull ImmutableMap<String, Object> facets) {
31+
this.datasetVersionId = datasetVersionId;
32+
this.facets = facets;
33+
}
34+
}
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
/*
2+
* Copyright 2018-2023 contributors to the Marquez project
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package marquez.common.models;
7+
8+
import com.fasterxml.jackson.annotation.JsonProperty;
9+
import com.google.common.collect.ImmutableMap;
10+
import lombok.EqualsAndHashCode;
11+
import lombok.Getter;
12+
import lombok.NonNull;
13+
import lombok.ToString;
14+
15+
/**
16+
* Class used to store dataset version and `outputFacets` which are assigned to datasets within
17+
* OpenLineage spec, but are exposed within Marquez api as a part of {@link
18+
* marquez.service.models.Run}
19+
*/
20+
@EqualsAndHashCode
21+
@ToString
22+
@Getter
23+
public class OutputDatasetVersion {
24+
25+
// Identifier of the dataset version; the constructor's @NonNull check rejects null.
private final DatasetVersionId datasetVersionId;
26+
// Facets held as an immutable map; presumably keyed by facet name -- TODO confirm.
private final ImmutableMap<String, Object> facets;
27+
28+
/**
 * Creates an instance from its two components. The {@code @JsonProperty} annotations bind the
 * arguments to the {@code datasetVersionId} and {@code facets} JSON properties.
 *
 * @param datasetVersionId identifier of the dataset version; must not be null
 * @param facets facets to store; must not be null
 * @throws NullPointerException if either argument is null (default behavior of Lombok's
 *     {@code @NonNull})
 */
public OutputDatasetVersion(
29+
@JsonProperty("datasetVersionId") @NonNull DatasetVersionId datasetVersionId,
30+
@JsonProperty("facets") @NonNull ImmutableMap<String, Object> facets) {
31+
this.datasetVersionId = datasetVersionId;
32+
this.facets = facets;
33+
}
34+
}

api/src/main/java/marquez/db/Columns.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ private Columns() {}
5555
public static final String NAMESPACE_NAME = "namespace_name";
5656
public static final String DATASET_NAME = "dataset_name";
5757
public static final String FACETS = "facets";
58+
public static final String DATASET_FACETS = "dataset_facets";
5859
public static final String TAGS = "tags";
5960
public static final String IS_HIDDEN = "is_hidden";
6061

api/src/main/java/marquez/db/DatasetDao.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ LEFT JOIN (
8686
df.dataset_version_uuid,
8787
JSONB_AGG(df.facet ORDER BY df.lineage_event_time ASC) AS facets
8888
FROM dataset_facets_view AS df
89-
WHERE df.facet IS NOT NULL
89+
WHERE df.facet IS NOT NULL AND (df.type ILIKE 'dataset' OR df.type ILIKE 'unknown')
9090
GROUP BY df.dataset_version_uuid
9191
) f ON f.dataset_version_uuid = d.current_version_uuid
9292
WHERE CAST((:namespaceName, :datasetName) AS DATASET_NAME) = ANY(d.dataset_symlinks)
@@ -134,7 +134,7 @@ LEFT JOIN (
134134
df.dataset_version_uuid,
135135
JSONB_AGG(df.facet ORDER BY df.lineage_event_time ASC) AS facets
136136
FROM dataset_facets_view AS df
137-
WHERE df.facet IS NOT NULL
137+
WHERE df.facet IS NOT NULL AND (df.type ILIKE 'dataset' OR df.type ILIKE 'unknown')
138138
GROUP BY df.dataset_version_uuid
139139
) f ON f.dataset_version_uuid = d.current_version_uuid
140140
WHERE d.namespace_name = :namespaceName

api/src/main/java/marquez/db/DatasetFacetsDao.java

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,58 @@ default void insertDatasetFacetsFor(
149149
FacetUtils.toPgObject(fieldName, jsonNode.get(fieldName))));
150150
}
151151

152+
/**
 * Inserts one dataset facet of type {@code Type.INPUT} per top-level field of {@code inputFacets}.
 *
 * <p>The facets object is converted to a JSON tree, and {@code insertDatasetFacet} is called once
 * for each field name found in that tree. All calls share the timestamp captured at entry.
 *
 * @param datasetUuid forwarded unchanged to {@code insertDatasetFacet}; must not be null
 * @param datasetVersionUuid forwarded unchanged to {@code insertDatasetFacet}; must not be null
 * @param runUuid forwarded unchanged to {@code insertDatasetFacet}; must not be null
 * @param lineageEventTime forwarded unchanged to {@code insertDatasetFacet}; must not be null
 * @param lineageEventType forwarded unchanged to {@code insertDatasetFacet}; must not be null
 * @param inputFacets facets whose top-level fields are inserted individually; must not be null
 */
default void insertInputDatasetFacetsFor(
153+
@NonNull UUID datasetUuid,
154+
@NonNull UUID datasetVersionUuid,
155+
@NonNull UUID runUuid,
156+
@NonNull Instant lineageEventTime,
157+
@NonNull String lineageEventType,
158+
@NonNull LineageEvent.InputDatasetFacets inputFacets) {
159+
// Single timestamp reused for every facet inserted by this call.
final Instant now = Instant.now();
160+
161+
// Convert the facets object to a JSON tree so its top-level field names can be enumerated.
JsonNode jsonNode = Utils.getMapper().valueToTree(inputFacets);
162+
// Sequential (non-parallel) stream over the field-name iterator.
StreamSupport.stream(
163+
Spliterators.spliteratorUnknownSize(jsonNode.fieldNames(), Spliterator.DISTINCT), false)
164+
.forEach(
165+
fieldName ->
166+
insertDatasetFacet(
167+
now,
168+
datasetUuid,
169+
datasetVersionUuid,
170+
runUuid,
171+
lineageEventTime,
172+
lineageEventType,
173+
Type.INPUT,
174+
fieldName,
175+
FacetUtils.toPgObject(fieldName, jsonNode.get(fieldName))));
176+
}
177+
178+
/**
 * Inserts one dataset facet of type {@code Type.OUTPUT} per top-level field of
 * {@code outputFacets}.
 *
 * <p>The facets object is converted to a JSON tree, and {@code insertDatasetFacet} is called once
 * for each field name found in that tree. All calls share the timestamp captured at entry.
 *
 * @param datasetUuid forwarded unchanged to {@code insertDatasetFacet}; must not be null
 * @param datasetVersionUuid forwarded unchanged to {@code insertDatasetFacet}; must not be null
 * @param runUuid forwarded unchanged to {@code insertDatasetFacet}; must not be null
 * @param lineageEventTime forwarded unchanged to {@code insertDatasetFacet}; must not be null
 * @param lineageEventType forwarded unchanged to {@code insertDatasetFacet}; must not be null
 * @param outputFacets facets whose top-level fields are inserted individually; must not be null
 */
default void insertOutputDatasetFacetsFor(
179+
@NonNull UUID datasetUuid,
180+
@NonNull UUID datasetVersionUuid,
181+
@NonNull UUID runUuid,
182+
@NonNull Instant lineageEventTime,
183+
@NonNull String lineageEventType,
184+
@NonNull LineageEvent.OutputDatasetFacets outputFacets) {
185+
// Single timestamp reused for every facet inserted by this call.
final Instant now = Instant.now();
186+
187+
// Convert the facets object to a JSON tree so its top-level field names can be enumerated.
JsonNode jsonNode = Utils.getMapper().valueToTree(outputFacets);
188+
// Sequential (non-parallel) stream over the field-name iterator.
StreamSupport.stream(
189+
Spliterators.spliteratorUnknownSize(jsonNode.fieldNames(), Spliterator.DISTINCT), false)
190+
.forEach(
191+
fieldName ->
192+
insertDatasetFacet(
193+
now,
194+
datasetUuid,
195+
datasetVersionUuid,
196+
runUuid,
197+
lineageEventTime,
198+
lineageEventType,
199+
Type.OUTPUT,
200+
fieldName,
201+
FacetUtils.toPgObject(fieldName, jsonNode.get(fieldName))));
202+
}
203+
152204
record DatasetFacetRow(
153205
Instant createdAt,
154206
UUID datasetUuid,

api/src/main/java/marquez/db/DatasetVersionDao.java

Lines changed: 32 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -156,10 +156,19 @@ default void updateDatasetVersionMetric(
156156

157157
@SqlQuery(
158158
"""
159+
WITH selected_dataset_versions AS (
160+
SELECT dv.*
161+
FROM dataset_versions dv
162+
WHERE dv.version = :version
163+
), selected_dataset_version_facets AS (
164+
SELECT dv.uuid, dv.dataset_name, dv.namespace_name, df.run_uuid, df.lineage_event_time, df.facet
165+
FROM selected_dataset_versions dv
166+
LEFT JOIN dataset_facets_view df ON df.dataset_version_uuid = dv.uuid
167+
)
159168
SELECT d.type, d.name, d.physical_name, d.namespace_name, d.source_name, d.description, dv.lifecycle_state,\s
160169
dv.created_at, dv.version, dv.fields, dv.run_uuid AS createdByRunUuid, sv.schema_location,
161170
t.tags, f.facets
162-
FROM dataset_versions dv
171+
FROM selected_dataset_versions dv
163172
LEFT JOIN datasets_view d ON d.uuid = dv.dataset_uuid
164173
LEFT JOIN stream_versions AS sv ON sv.dataset_version_uuid = dv.uuid
165174
LEFT JOIN (
@@ -169,21 +178,28 @@ SELECT ARRAY_AGG(t.name) AS tags, m.dataset_uuid
169178
GROUP BY m.dataset_uuid
170179
) t ON t.dataset_uuid = dv.dataset_uuid
171180
LEFT JOIN (
172-
SELECT dvf.dataset_version_uuid,
173-
JSONB_AGG(dvf.facet ORDER BY dvf.lineage_event_time ASC) AS facets
174-
FROM dataset_facets_view dvf
175-
GROUP BY dataset_version_uuid
176-
) f ON f.dataset_version_uuid = dv.uuid
177-
WHERE dv.version = :version
178-
""")
181+
SELECT dvf.uuid AS dataset_uuid, JSONB_AGG(dvf.facet ORDER BY dvf.lineage_event_time ASC) AS facets
182+
FROM selected_dataset_version_facets dvf
183+
WHERE dvf.run_uuid = dvf.run_uuid
184+
GROUP BY dvf.uuid
185+
) f ON f.dataset_uuid = dv.uuid""")
179186
Optional<DatasetVersion> findBy(UUID version);
180187

181188
@SqlQuery(
182189
"""
190+
WITH selected_dataset_versions AS (
191+
SELECT dv.*
192+
FROM dataset_versions dv
193+
WHERE dv.uuid = :uuid
194+
), selected_dataset_version_facets AS (
195+
SELECT dv.uuid, dv.dataset_name, dv.namespace_name, df.run_uuid, df.lineage_event_time, df.facet
196+
FROM selected_dataset_versions dv
197+
LEFT JOIN dataset_facets_view df ON df.dataset_version_uuid = dv.uuid AND (df.type ILIKE 'dataset' OR df.type ILIKE 'unknown')
198+
)
183199
SELECT d.type, d.name, d.physical_name, d.namespace_name, d.source_name, d.description, dv.lifecycle_state,\s
184200
dv.created_at, dv.version, dv.fields, dv.run_uuid AS createdByRunUuid, sv.schema_location,
185201
t.tags, f.facets
186-
FROM dataset_versions dv
202+
FROM selected_dataset_versions dv
187203
LEFT JOIN datasets_view d ON d.uuid = dv.dataset_uuid
188204
LEFT JOIN stream_versions AS sv ON sv.dataset_version_uuid = dv.uuid
189205
LEFT JOIN (
@@ -192,14 +208,12 @@ SELECT ARRAY_AGG(t.name) AS tags, m.dataset_uuid
192208
INNER JOIN datasets_tag_mapping AS m ON m.tag_uuid = t.uuid
193209
GROUP BY m.dataset_uuid
194210
) t ON t.dataset_uuid = dv.dataset_uuid
195-
LEFT JOIN (
196-
SELECT dvf.dataset_version_uuid,
197-
JSONB_AGG(dvf.facet ORDER BY dvf.lineage_event_time ASC) AS facets
198-
FROM dataset_facets_view dvf
199-
GROUP BY dataset_version_uuid
200-
) f ON f.dataset_version_uuid = dv.uuid
201-
WHERE dv.uuid = :uuid
202-
""")
211+
LEFT JOIN (
212+
SELECT dvf.uuid AS dataset_uuid, JSONB_AGG(dvf.facet ORDER BY dvf.lineage_event_time ASC) AS facets
213+
FROM selected_dataset_version_facets dvf
214+
WHERE dvf.run_uuid = dvf.run_uuid
215+
GROUP BY dvf.uuid
216+
) f ON f.dataset_uuid = dv.uuid""")
203217
Optional<DatasetVersion> findByUuid(UUID uuid);
204218

205219
default Optional<DatasetVersion> findByWithRun(UUID version) {
@@ -246,6 +260,7 @@ LEFT JOIN (
246260
SELECT dvf.dataset_version_uuid,
247261
JSONB_AGG(dvf.facet ORDER BY dvf.lineage_event_time ASC) AS facets
248262
FROM dataset_facets_view dvf
263+
WHERE (type ILIKE 'dataset' OR type ILIKE 'unknown')
249264
GROUP BY dataset_version_uuid
250265
) f ON f.dataset_version_uuid = dv.uuid
251266
WHERE dv.namespace_name = :namespaceName

api/src/main/java/marquez/db/LineageDao.java

Lines changed: 39 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -38,37 +38,45 @@ public interface LineageDao {
3838
*/
3939
@SqlQuery(
4040
"""
41-
WITH RECURSIVE
42-
job_io AS (
43-
SELECT COALESCE(j.symlink_target_uuid, j.uuid) AS job_uuid,
44-
ARRAY_AGG(DISTINCT j.uuid) AS ids,
45-
ARRAY_AGG(DISTINCT io.dataset_uuid) FILTER (WHERE io_type='INPUT') AS inputs,
46-
ARRAY_AGG(DISTINCT io.dataset_uuid) FILTER (WHERE io_type='OUTPUT') AS outputs
47-
FROM jobs j
48-
LEFT JOIN jobs_view s On s.uuid=j.symlink_target_uuid
49-
LEFT JOIN job_versions v on v.uuid=COALESCE(s.current_version_uuid, j.current_version_uuid)
50-
LEFT JOIN job_versions_io_mapping io ON io.job_version_uuid=v.uuid
51-
GROUP BY COALESCE(j.symlink_target_uuid, j.uuid)
52-
),
53-
lineage(job_uuid, inputs, outputs) AS (
54-
SELECT COALESCE(j.symlink_target_uuid, j.uuid) AS job_uuid,
55-
COALESCE(inputs, Array[]::uuid[]) AS inputs,
56-
COALESCE(outputs, Array[]::uuid[]) AS outputs,
57-
0 AS depth
58-
FROM jobs_view j
59-
INNER JOIN job_io io ON j.uuid=ANY(io.ids)
60-
WHERE io.ids && ARRAY[<jobIds>]::uuid[]
61-
UNION
62-
SELECT io.job_uuid, io.inputs, io.outputs, l.depth + 1
63-
FROM job_io io,
64-
lineage l
65-
WHERE io.job_uuid != l.job_uuid AND
66-
array_cat(io.inputs, io.outputs) && array_cat(l.inputs, l.outputs)
67-
AND depth < :depth)
68-
SELECT DISTINCT ON (j.uuid) j.*, inputs AS input_uuids, outputs AS output_uuids, jc.context
69-
FROM lineage l2
70-
INNER JOIN jobs_view j ON j.uuid=l2.job_uuid
71-
LEFT JOIN job_contexts jc on jc.uuid = j.current_job_context_uuid;
41+
WITH RECURSIVE
42+
-- Find the current version of a job or its symlink target if the target has no
43+
-- current_version_uuid. This ensures that we don't lose lineage for a job after it is
44+
-- symlinked to another job but before that target job has run successfully.
45+
job_current_version AS (
46+
SELECT COALESCE(j.symlink_target_uuid, j.uuid) AS job_uuid,
47+
COALESCE(s.current_version_uuid, j.current_version_uuid) AS job_version_uuid
48+
FROM jobs j
49+
LEFT JOIN jobs s ON s.uuid=j.symlink_target_uuid
50+
WHERE s.current_version_uuid IS NULL
51+
),
52+
job_io AS (
53+
SELECT j.job_uuid,
54+
ARRAY_AGG(DISTINCT io.dataset_uuid) FILTER (WHERE io_type='INPUT') AS inputs,
55+
ARRAY_AGG(DISTINCT io.dataset_uuid) FILTER (WHERE io_type='OUTPUT') AS outputs
56+
FROM job_versions_io_mapping io
57+
INNER JOIN job_current_version j ON io.job_version_uuid=j.job_version_uuid
58+
GROUP BY j.job_uuid
59+
),
60+
lineage(job_uuid, inputs, outputs) AS (
61+
SELECT v.job_uuid AS job_uuid,
62+
COALESCE(inputs, Array[]::uuid[]) AS inputs,
63+
COALESCE(outputs, Array[]::uuid[]) AS outputs,
64+
0 AS depth
65+
FROM jobs j
66+
INNER JOIN job_current_version v ON (j.symlink_target_uuid IS NULL AND j.uuid=v.job_uuid) OR v.job_uuid=j.symlink_target_uuid
67+
LEFT JOIN job_io io ON io.job_uuid=v.job_uuid
68+
WHERE j.uuid IN (<jobIds>) OR j.symlink_target_uuid IN (<jobIds>)
69+
UNION
70+
SELECT io.job_uuid, io.inputs, io.outputs, l.depth + 1
71+
FROM job_io io,
72+
lineage l
73+
WHERE io.job_uuid != l.job_uuid AND
74+
array_cat(io.inputs, io.outputs) && array_cat(l.inputs, l.outputs)
75+
AND depth < :depth)
76+
SELECT DISTINCT ON (j.uuid) j.*, inputs AS input_uuids, outputs AS output_uuids, jc.context
77+
FROM lineage l2
78+
INNER JOIN jobs_view j ON j.uuid=l2.job_uuid
79+
LEFT JOIN job_contexts jc on jc.uuid = j.current_job_context_uuid;
7280
""")
7381
Set<JobData> getLineage(@BindList Set<UUID> jobIds, int depth);
7482

api/src/main/java/marquez/db/OpenLineageDao.java

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,18 @@ default UpdateLineageRow updateBaseMarquezModel(LineageEvent event, ObjectMapper
279279
now,
280280
event.getEventType(),
281281
facets));
282+
283+
// InputFacets ...
284+
Optional.ofNullable(dataset.getInputFacets())
285+
.ifPresent(
286+
facets ->
287+
datasetFacetsDao.insertInputDatasetFacetsFor(
288+
record.getDatasetRow().getUuid(),
289+
record.getDatasetVersionRow().getUuid(),
290+
runUuid,
291+
now,
292+
event.getEventType(),
293+
facets));
282294
}
283295
}
284296
bag.setInputs(Optional.ofNullable(datasetInputs));
@@ -314,6 +326,18 @@ default UpdateLineageRow updateBaseMarquezModel(LineageEvent event, ObjectMapper
314326
now,
315327
event.getEventType(),
316328
facets));
329+
330+
// OutputFacets ...
331+
Optional.ofNullable(dataset.getOutputFacets())
332+
.ifPresent(
333+
facets ->
334+
datasetFacetsDao.insertOutputDatasetFacetsFor(
335+
record.getDatasetRow().getUuid(),
336+
record.getDatasetVersionRow().getUuid(),
337+
runUuid,
338+
now,
339+
event.getEventType(),
340+
facets));
317341
}
318342
}
319343

0 commit comments

Comments
 (0)