Skip to content

Commit cf50ad0

Browse files
committed
delete: add possibility to soft-delete datasets
Signed-off-by: Maciej Obuchowski <obuchowski.maciej@gmail.com>
1 parent 7a7a2a8 commit cf50ad0

11 files changed

Lines changed: 326 additions & 58 deletions

File tree

api/src/main/java/marquez/api/DatasetResource.java

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import javax.validation.Valid;
1818
import javax.validation.constraints.Min;
1919
import javax.ws.rs.Consumes;
20+
import javax.ws.rs.DELETE;
2021
import javax.ws.rs.DefaultValue;
2122
import javax.ws.rs.GET;
2223
import javax.ws.rs.POST;
@@ -150,6 +151,27 @@ public Response list(
150151
return Response.ok(new ResultsPage<>("datasets", datasets, totalCount)).build();
151152
}
152153

154+
@Timed
155+
@ResponseMetered
156+
@ExceptionMetered
157+
@DELETE
158+
@Path("{dataset}")
159+
@Produces(APPLICATION_JSON)
160+
public Response delete(
161+
@PathParam("namespace") NamespaceName namespaceName,
162+
@PathParam("dataset") DatasetName datasetName) {
163+
throwIfNotExists(namespaceName);
164+
165+
datasetService
166+
.softDelete(namespaceName.getValue(), datasetName.getValue())
167+
.orElseThrow(() -> new DatasetNotFoundException(datasetName));
168+
Dataset dataset =
169+
datasetService
170+
.findDatasetByName(namespaceName.getValue(), datasetName.getValue())
171+
.orElseThrow(() -> new DatasetNotFoundException(datasetName));
172+
return Response.ok(dataset).build();
173+
}
174+
153175
@Timed
154176
@ResponseMetered
155177
@ExceptionMetered

api/src/main/java/marquez/db/DatasetDao.java

Lines changed: 57 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -138,51 +138,53 @@ default void setFields(Dataset ds) {
138138
Optional<DatasetRow> getUuid(String namespaceName, String datasetName);
139139

140140
@SqlQuery(
141-
"WITH selected_datasets AS (\n"
142-
+ " SELECT d.*\n"
143-
+ " FROM datasets d\n"
144-
+ " WHERE d.namespace_name = :namespaceName\n"
145-
+ " ORDER BY d.name\n"
146-
+ " LIMIT :limit OFFSET :offset\n"
147-
+ "), dataset_runs AS (\n"
148-
+ " SELECT d.uuid, d.name, d.namespace_name, dv.run_uuid, dv.lifecycle_state, event_time, event\n"
149-
+ " FROM selected_datasets d\n"
150-
+ " INNER JOIN dataset_versions dv ON dv.uuid = d.current_version_uuid\n"
151-
+ " LEFT JOIN LATERAL (\n"
152-
+ " SELECT run_uuid, event_time, event FROM lineage_events\n"
153-
+ " WHERE run_uuid = dv.run_uuid\n"
154-
+ " ) e ON e.run_uuid = dv.run_uuid\n"
155-
+ " UNION\n"
156-
+ " SELECT d.uuid, d.name, d.namespace_name, rim.run_uuid, lifecycle_state, event_time, event\n"
157-
+ " FROM selected_datasets d\n"
158-
+ " INNER JOIN dataset_versions dv ON dv.uuid = d.current_version_uuid\n"
159-
+ " LEFT JOIN runs_input_mapping rim ON dv.uuid = rim.dataset_version_uuid\n"
160-
+ " LEFT JOIN LATERAL (\n"
161-
+ " SELECT run_uuid, event_time, event FROM lineage_events\n"
162-
+ " WHERE run_uuid = rim.run_uuid\n"
163-
+ " ) e ON e.run_uuid = rim.run_uuid\n"
164-
+ ")\n"
165-
+ "SELECT d.*, dv.fields, dv.lifecycle_state, sv.schema_location, t.tags, facets\n"
166-
+ "FROM selected_datasets d\n"
167-
+ "LEFT JOIN dataset_versions dv ON d.current_version_uuid = dv.uuid\n"
168-
+ "LEFT JOIN stream_versions AS sv ON sv.dataset_version_uuid = dv.uuid\n"
169-
+ "LEFT JOIN (\n"
170-
+ " SELECT ARRAY_AGG(t.name) AS tags, m.dataset_uuid\n"
171-
+ " FROM tags AS t\n"
172-
+ " INNER JOIN datasets_tag_mapping AS m ON m.tag_uuid = t.uuid\n"
173-
+ " GROUP BY m.dataset_uuid\n"
174-
+ ") t ON t.dataset_uuid = d.uuid\n"
175-
+ "LEFT JOIN (\n"
176-
+ " SELECT d2.uuid AS dataset_uuid, JSONB_AGG(ds->'facets' ORDER BY event_time) AS facets\n"
177-
+ " FROM dataset_runs d2,\n"
178-
+ " jsonb_array_elements(coalesce(d2.event -> 'inputs', '[]'::jsonb) || coalesce(d2.event -> 'outputs', '[]'::jsonb)) AS ds\n"
179-
+ " WHERE d2.run_uuid = d2.run_uuid\n"
180-
+ " AND ds -> 'facets' IS NOT NULL\n"
181-
+ " AND ds ->> 'name' = d2.name\n"
182-
+ " AND ds ->> 'namespace' = d2.namespace_name\n"
183-
+ " GROUP BY d2.uuid\n"
184-
+ ") f ON f.dataset_uuid = d.uuid\n"
185-
+ "ORDER BY d.name")
141+
"""
142+
WITH selected_datasets AS (
143+
SELECT d.*
144+
FROM datasets d
145+
WHERE d.namespace_name = :namespaceName
146+
AND d.is_deleted is false
147+
ORDER BY d.name
148+
LIMIT :limit OFFSET :offset
149+
), dataset_runs AS (
150+
SELECT d.uuid, d.name, d.namespace_name, dv.run_uuid, dv.lifecycle_state, event_time, event
151+
FROM selected_datasets d
152+
INNER JOIN dataset_versions dv ON dv.uuid = d.current_version_uuid
153+
LEFT JOIN LATERAL (
154+
SELECT run_uuid, event_time, event FROM lineage_events
155+
WHERE run_uuid = dv.run_uuid
156+
) e ON e.run_uuid = dv.run_uuid
157+
UNION
158+
SELECT d.uuid, d.name, d.namespace_name, rim.run_uuid, lifecycle_state, event_time, event
159+
FROM selected_datasets d
160+
INNER JOIN dataset_versions dv ON dv.uuid = d.current_version_uuid
161+
LEFT JOIN runs_input_mapping rim ON dv.uuid = rim.dataset_version_uuid
162+
LEFT JOIN LATERAL (
163+
SELECT run_uuid, event_time, event FROM lineage_events
164+
WHERE run_uuid = rim.run_uuid
165+
) e ON e.run_uuid = rim.run_uuid
166+
)
167+
SELECT d.*, dv.fields, dv.lifecycle_state, sv.schema_location, t.tags, facets
168+
FROM selected_datasets d
169+
LEFT JOIN dataset_versions dv ON d.current_version_uuid = dv.uuid
170+
LEFT JOIN stream_versions AS sv ON sv.dataset_version_uuid = dv.uuid
171+
LEFT JOIN (
172+
SELECT ARRAY_AGG(t.name) AS tags, m.dataset_uuid
173+
FROM tags AS t
174+
INNER JOIN datasets_tag_mapping AS m ON m.tag_uuid = t.uuid
175+
GROUP BY m.dataset_uuid
176+
) t ON t.dataset_uuid = d.uuid
177+
LEFT JOIN (
178+
SELECT d2.uuid AS dataset_uuid, JSONB_AGG(ds->'facets' ORDER BY event_time) AS facets
179+
FROM dataset_runs d2,
180+
jsonb_array_elements(coalesce(d2.event -> 'inputs', '[]'::jsonb) || coalesce(d2.event -> 'outputs', '[]'::jsonb)) AS ds
181+
WHERE d2.run_uuid = d2.run_uuid
182+
AND ds -> 'facets' IS NOT NULL
183+
AND ds ->> 'name' = d2.name
184+
AND ds ->> 'namespace' = d2.namespace_name
185+
GROUP BY d2.uuid
186+
) f ON f.dataset_uuid = d.uuid
187+
ORDER BY d.name""")
186188
List<Dataset> findAll(String namespaceName, int limit, int offset);
187189

188190
@SqlQuery("SELECT count(*) FROM datasets")
@@ -284,6 +286,16 @@ DatasetRow upsert(
284286
String name,
285287
String physicalName);
286288

289+
@SqlQuery(
290+
"""
291+
UPDATE datasets
292+
SET is_deleted = true
293+
WHERE namespace_name = :namespaceName
294+
AND name = :name
295+
RETURNING *
296+
""")
297+
Optional<DatasetRow> softDelete(String namespaceName, String name);
298+
287299
@Transaction
288300
default Dataset upsertDatasetMeta(
289301
NamespaceName namespaceName, DatasetName datasetName, DatasetMeta datasetMeta) {

api/src/main/java/marquez/db/LineageDao.java

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -67,11 +67,14 @@ public interface LineageDao {
6767
Set<JobData> getLineage(@BindList Set<UUID> jobIds, int depth);
6868

6969
@SqlQuery(
70-
"SELECT ds.*, dv.fields, dv.lifecycle_state\n"
71-
+ "FROM datasets ds\n"
72-
+ "LEFT JOIN dataset_versions dv on dv.uuid = ds.current_version_uuid\n"
73-
+ "WHERE ds.uuid IN (<dsUuids>);")
74-
Set<DatasetData> getDatasetData(@BindList Set<UUID> dsUuids);
70+
"""
71+
SELECT ds.*, dv.fields, dv.lifecycle_state
72+
FROM datasets ds
73+
LEFT JOIN dataset_versions dv on dv.uuid = ds.current_version_uuid
74+
WHERE ds.uuid IN (<dsUuids>)
75+
AND ds.is_deleted is false
76+
""")
77+
Set<DatasetData> getNonDeletedDatasetData(@BindList Set<UUID> dsUuids);
7578

7679
@SqlQuery(
7780
"select j.uuid from jobs j\n"

api/src/main/java/marquez/service/LineageService.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ public Lineage lineage(NodeId nodeId, int depth) {
7575
.collect(Collectors.toSet());
7676
Set<DatasetData> datasets = new HashSet<>();
7777
if (!datasetIds.isEmpty()) {
78-
datasets.addAll(getDatasetData(datasetIds));
78+
datasets.addAll(this.getNonDeletedDatasetData(datasetIds));
7979
}
8080

8181
return toLineage(jobData, datasets);

api/src/test/java/marquez/DatasetIntegrationTest.java

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,10 @@
1111
import com.google.common.collect.ImmutableList;
1212
import com.google.common.collect.ImmutableMap;
1313
import com.google.common.collect.ImmutableSet;
14+
import java.io.IOException;
1415
import java.net.http.HttpResponse;
16+
import java.time.Instant;
17+
import java.time.ZoneId;
1518
import java.time.ZonedDateTime;
1619
import java.util.Collections;
1720
import java.util.List;
@@ -357,4 +360,84 @@ public void testApp_upsertDescription() {
357360
// Description stays
358361
assertThat(dataset2.getDescription()).isEqualTo(DESCRIPTION.getDescription());
359362
}
363+
364+
@Test
365+
public void testApp_doesNotShowDeletedDataset() throws IOException {
366+
String namespace = "namespace";
367+
String name = "table";
368+
LineageEvent event =
369+
new LineageEvent(
370+
"COMPLETE",
371+
Instant.now().atZone(ZoneId.systemDefault()),
372+
new LineageEvent.Run(UUID.randomUUID().toString(), null),
373+
new LineageEvent.Job("namespace", "job_name", null),
374+
List.of(new LineageEvent.Dataset(namespace, name, LineageTestUtils.newDatasetFacet())),
375+
Collections.emptyList(),
376+
"the_producer");
377+
378+
final CompletableFuture<Integer> resp =
379+
this.sendLineage(Utils.toJson(event))
380+
.thenApply(HttpResponse::statusCode)
381+
.whenComplete(
382+
(val, error) -> {
383+
if (error != null) {
384+
Assertions.fail("Could not complete request");
385+
}
386+
});
387+
388+
// Ensure the event was accepted and the expected 201 response code returned.
389+
assertThat(resp.join()).isEqualTo(201);
390+
391+
client.deleteDataset(namespace, name);
392+
393+
List<Dataset> datasets = client.listDatasets(namespace);
394+
assertThat(datasets).hasSize(0);
395+
}
396+
397+
@Test
398+
public void testApp_showsDeletedDatasetAfterReceivingNewVersion() throws IOException {
399+
String namespace = "namespace";
400+
String name = "anotherTable";
401+
LineageEvent event =
402+
new LineageEvent(
403+
"COMPLETE",
404+
Instant.now().atZone(ZoneId.systemDefault()),
405+
new LineageEvent.Run(UUID.randomUUID().toString(), null),
406+
new LineageEvent.Job("namespace", "job_name", null),
407+
List.of(new LineageEvent.Dataset(namespace, name, LineageTestUtils.newDatasetFacet())),
408+
Collections.emptyList(),
409+
"the_producer");
410+
411+
CompletableFuture<Integer> resp =
412+
this.sendLineage(Utils.toJson(event))
413+
.thenApply(HttpResponse::statusCode)
414+
.whenComplete(
415+
(val, error) -> {
416+
if (error != null) {
417+
Assertions.fail("Could not complete request");
418+
}
419+
});
420+
421+
// Ensure the event was accepted and the expected 201 response code returned.
422+
assertThat(resp.join()).isEqualTo(201);
423+
424+
client.deleteDataset(namespace, name);
425+
426+
List<Dataset> datasets = client.listDatasets(namespace);
427+
assertThat(datasets).hasSize(0);
428+
resp = this.sendLineage(Utils.toJson(event))
429+
.thenApply(HttpResponse::statusCode)
430+
.whenComplete(
431+
(val, error) -> {
432+
if (error != null) {
433+
Assertions.fail("Could not complete request");
434+
}
435+
});
436+
437+
assertThat(resp.join()).isEqualTo(201);
438+
439+
datasets = client.listDatasets(namespace);
440+
assertThat(datasets).hasSize(1);
441+
}
442+
360443
}

api/src/test/java/marquez/db/DatasetDaoTest.java

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -287,20 +287,22 @@ public void testGetDatasets() {
287287
ImmutableMap.of("writeFacet", new CustomValueFacet("firstWriteValue")))));
288288

289289
String secondDatasetName = "secondDataset";
290+
String deletedDatasetName = "deletedDataset";
290291
createLineageRow(
291292
openLineageDao,
292293
"secondWriteJob",
293294
"COMPLETE",
294295
jobFacet,
295296
Collections.emptyList(),
296-
Collections.singletonList(
297+
List.of(
297298
new Dataset(
298299
NAMESPACE,
299300
secondDatasetName,
300301
newDatasetFacet(
301302
ImmutableMap.of("writeFacet", new CustomValueFacet("secondWriteValue")),
302303
new SchemaField("age", "int", "the age"),
303-
new SchemaField("address", "string", "the address")))));
304+
new SchemaField("address", "string", "the address"))),
305+
new Dataset(NAMESPACE, deletedDatasetName, newDatasetFacet())));
304306

305307
createLineageRow(
306308
openLineageDao,
@@ -319,6 +321,11 @@ public void testGetDatasets() {
319321
Collections.emptyList());
320322

321323
List<marquez.service.models.Dataset> datasets = datasetDao.findAll(NAMESPACE, 5, 0);
324+
assertThat(datasets).hasSize(3);
325+
326+
datasetDao.softDelete(NAMESPACE, deletedDatasetName);
327+
328+
datasets = datasetDao.findAll(NAMESPACE, 5, 0);
322329
assertThat(datasets).hasSize(2);
323330

324331
// datasets sorted alphabetically, so commonDataset is first
@@ -357,8 +364,7 @@ public void testGetDatasets() {
357364
InstanceOfAssertFactories.map(String.class, Object.class))
358365
.isNotEmpty()
359366
.hasSize(6)
360-
.containsKeys(
361-
"documentation", "description", "schema", "dataSource", "writeFacet", "inputFacet")
367+
.containsKeys("documentation", "description", "schema", "dataSource", "inputFacet")
362368
.containsEntry(
363369
"writeFacet",
364370
ImmutableMap.of(
@@ -379,6 +385,28 @@ public void testGetDatasets() {
379385
"http://test.schema/"));
380386
}
381387

388+
@Test
389+
public void testGetSpecificDatasetReturnsDatasetIfDeleted() {
390+
createLineageRow(
391+
openLineageDao,
392+
"writeJob",
393+
"COMPLETE",
394+
jobFacet,
395+
Collections.emptyList(),
396+
Collections.singletonList(newCommonDataset(Collections.emptyMap())));
397+
398+
marquez.service.models.Dataset dataset = datasetDao.findDatasetByName(NAMESPACE, DATASET).get();
399+
400+
assertThat(dataset)
401+
.matches(ds -> ds.getName().getValue().equals(DATASET))
402+
.extracting(
403+
marquez.service.models.Dataset::getFacets,
404+
InstanceOfAssertFactories.map(String.class, Object.class))
405+
.isNotEmpty()
406+
.hasSize(4)
407+
.containsKeys("documentation", "description", "schema", "dataSource");
408+
}
409+
382410
@Test
383411
public void testGetDatasetsWithMultipleVersions() {
384412
String secondDatasetName = "secondDataset";

0 commit comments

Comments
 (0)