Skip to content

Commit 90feb2e

Browse files
authored
Fix/symlinked jobs in queries (#2053)
* Update jobs update function to dedupe aliases
  Signed-off-by: Michael Collado <collado.mike@gmail.com>
* Update search query to accommodate symlinked jobs and aliases
  Signed-off-by: Michael Collado <collado.mike@gmail.com>
* Update lineage query to include symlinked jobs in lineage
  Signed-off-by: Michael Collado <collado.mike@gmail.com>
* Updated search test to validate symlink target jobs are returned
  Signed-off-by: Michael Collado <collado.mike@gmail.com>
1 parent 81972b0 commit 90feb2e

5 files changed

Lines changed: 181 additions & 10 deletions

File tree

api/src/main/java/marquez/db/LineageDao.java

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,9 @@ public interface LineageDao {
4343
+ " SELECT j.uuid AS job_uuid,\n"
4444
+ " ARRAY_AGG(DISTINCT io.dataset_uuid) FILTER (WHERE io_type='INPUT') AS inputs,\n"
4545
+ " ARRAY_AGG(DISTINCT io.dataset_uuid) FILTER (WHERE io_type='OUTPUT') AS outputs\n"
46-
+ " FROM jobs j\n"
47-
+ " LEFT JOIN job_versions v on j.current_version_uuid = v.uuid\n"
46+
+ " FROM jobs_view j\n"
47+
+ " LEFT JOIN jobs_view s ON s.symlink_target_uuid=j.uuid\n"
48+
+ " LEFT JOIN job_versions v on COALESCE(j.current_version_uuid, s.current_version_uuid) = v.uuid\n"
4849
+ " LEFT JOIN job_versions_io_mapping io on v.uuid = io.job_version_uuid\n"
4950
+ " GROUP BY j.uuid\n"
5051
+ " ),\n"
@@ -60,9 +61,10 @@ public interface LineageDao {
6061
+ " array_cat(io.inputs, io.outputs) && array_cat(l.inputs, l.outputs)\n"
6162
+ " AND depth < :depth"
6263
+ " )\n"
63-
+ "SELECT DISTINCT ON (l2.job_uuid) j.*, inputs AS input_uuids, outputs AS output_uuids, jc.context\n"
64+
+ "SELECT DISTINCT ON (j.uuid) j.*, inputs AS input_uuids, outputs AS output_uuids, jc.context\n"
6465
+ "FROM lineage l2\n"
65-
+ "INNER JOIN jobs_view j ON j.uuid=l2.job_uuid\n"
66+
+ "INNER JOIN jobs_view s ON s.uuid=l2.job_uuid\n"
67+
+ "INNER JOIN jobs_view j ON j.uuid=COALESCE(s.symlink_target_uuid, s.uuid)\n"
6668
+ "LEFT JOIN job_contexts jc on jc.uuid = j.current_job_context_uuid")
6769
Set<JobData> getLineage(@BindList Set<UUID> jobIds, int depth);
6870

@@ -88,7 +90,8 @@ public interface LineageDao {
8890
+ " SELECT DISTINCT on(r.job_name, r.namespace_name) r.*, jv.version\n"
8991
+ " FROM runs_view r\n"
9092
+ " INNER JOIN job_versions jv ON jv.uuid=r.job_version_uuid\n"
91-
+ " WHERE jv.job_uuid in (<jobUuid>)\n"
93+
+ " INNER JOIN jobs_view j ON j.uuid=jv.job_uuid\n"
94+
+ " WHERE j.uuid in (<jobUuid>) OR j.symlink_target_uuid IN (<jobUuid>)\n"
9295
+ " ORDER BY r.job_name, r.namespace_name, created_at DESC\n"
9396
+ ")\n"
9497
+ "SELECT r.*, ra.args, ctx.context, f.facets,\n"

api/src/main/java/marquez/db/SearchDao.java

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,13 @@ public interface SearchDao {
3333
+ " FROM datasets AS d\n"
3434
+ " WHERE d.name ilike '%' || :query || '%'\n"
3535
+ " UNION\n"
36-
+ " SELECT 'JOB' AS type, j.name, j.updated_at, j.namespace_name\n"
37-
+ " FROM jobs_view AS j\n"
36+
+ " SELECT DISTINCT ON (j.namespace_name, j.name) \n"
37+
+ " 'JOB' AS type, j.name, j.updated_at, j.namespace_name\n"
38+
+ " FROM (SELECT namespace_name, name, unnest(aliases) AS alias, updated_at \n"
39+
+ " FROM jobs_view WHERE symlink_target_uuid IS NULL\n"
40+
+ " ORDER BY updated_at DESC) AS j\n"
3841
+ " WHERE j.name ilike '%' || :query || '%'\n"
42+
+ " OR j.alias ilike '%' || :query || '%'\n"
3943
+ ") AS results\n"
4044
+ "WHERE type = :filter OR CAST(:filter AS TEXT) IS NULL\n"
4145
+ "ORDER BY :sort\n"

api/src/main/resources/marquez/db/migration/R__1_Jobs_view_and_rewrite_function.sql

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ BEGIN
111111
INNER JOIN fqn jf ON jf.uuid = COALESCE(js.link_target_uuid, j.uuid)
112112
ON CONFLICT (uuid) DO UPDATE
113113
SET job_fqn=EXCLUDED.job_fqn,
114-
aliases = jobs_fqn.aliases || EXCLUDED.aliases;
114+
aliases = (SELECT array_agg(DISTINCT a) FROM (SELECT unnest(jobs_fqn.aliases) AS a UNION SELECT unnest(EXCLUDED.aliases) AS a) al);
115115
END IF;
116116
SELECT * INTO inserted_job FROM jobs_view WHERE uuid=job_uuid;
117117
return inserted_job;

api/src/test/java/marquez/db/LineageDaoTest.java

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
import static org.assertj.core.api.Assertions.assertThat;
1515

1616
import com.google.common.base.Functions;
17+
import java.sql.SQLException;
1718
import java.util.Arrays;
1819
import java.util.Collections;
1920
import java.util.HashSet;
@@ -25,10 +26,13 @@
2526
import java.util.UUID;
2627
import java.util.stream.Collectors;
2728
import java.util.stream.Stream;
29+
import marquez.common.models.JobType;
2830
import marquez.db.LineageTestUtils.DatasetConsumerJob;
2931
import marquez.db.LineageTestUtils.JobLineage;
3032
import marquez.db.models.DatasetData;
3133
import marquez.db.models.JobData;
34+
import marquez.db.models.JobRow;
35+
import marquez.db.models.NamespaceRow;
3236
import marquez.db.models.UpdateLineageRow;
3337
import marquez.jdbi.MarquezJdbiExternalPostgresExtension;
3438
import marquez.service.models.LineageEvent;
@@ -44,6 +48,7 @@
4448
import org.junit.jupiter.api.BeforeAll;
4549
import org.junit.jupiter.api.Test;
4650
import org.junit.jupiter.api.extension.ExtendWith;
51+
import org.postgresql.util.PGobject;
4752

4853
@ExtendWith(MarquezJdbiExternalPostgresExtension.class)
4954
public class LineageDaoTest {
@@ -177,6 +182,102 @@ public void testGetLineage() {
177182
}
178183
}
179184

185+
@Test
186+
public void testGetLineageForSymlinkedJob() throws SQLException {
187+
188+
UpdateLineageRow writeJob =
189+
LineageTestUtils.createLineageRow(
190+
openLineageDao,
191+
"writeJob",
192+
"COMPLETE",
193+
jobFacet,
194+
Arrays.asList(),
195+
Arrays.asList(dataset));
196+
List<JobLineage> jobRows =
197+
writeDownstreamLineage(
198+
openLineageDao,
199+
new LinkedList<>(
200+
Arrays.asList(
201+
new DatasetConsumerJob("readJob", 20, Optional.of("outputData")),
202+
new DatasetConsumerJob("downstreamJob", 1, Optional.empty()))),
203+
jobFacet,
204+
dataset);
205+
206+
NamespaceRow namespaceRow =
207+
jdbi.onDemand(NamespaceDao.class)
208+
.findNamespaceByName(writeJob.getJob().getNamespaceName())
209+
.get();
210+
211+
PGobject inputs = new PGobject();
212+
inputs.setType("json");
213+
inputs.setValue("[]");
214+
215+
String symlinkTargetJobName = "A_new_write_job";
216+
JobRow targetJob =
217+
jdbi.onDemand(JobDao.class)
218+
.upsertJob(
219+
UUID.randomUUID(),
220+
JobType.valueOf(writeJob.getJob().getType()),
221+
writeJob.getJob().getCreatedAt(),
222+
namespaceRow.getUuid(),
223+
writeJob.getJob().getNamespaceName(),
224+
symlinkTargetJobName,
225+
writeJob.getJob().getDescription().orElse(null),
226+
writeJob.getJob().getJobContextUuid().orElse(null),
227+
writeJob.getJob().getLocation(),
228+
null,
229+
inputs);
230+
jdbi.onDemand(JobDao.class)
231+
.upsertJob(
232+
writeJob.getJob().getUuid(),
233+
JobType.valueOf(writeJob.getJob().getType()),
234+
writeJob.getJob().getCreatedAt(),
235+
namespaceRow.getUuid(),
236+
writeJob.getJob().getNamespaceName(),
237+
writeJob.getJob().getName(),
238+
writeJob.getJob().getDescription().orElse(null),
239+
writeJob.getJob().getJobContextUuid().orElse(null),
240+
writeJob.getJob().getLocation(),
241+
targetJob.getUuid(),
242+
inputs);
243+
244+
// fetch the first "targetJob" lineage.
245+
Set<JobData> connectedJobs =
246+
lineageDao.getLineage(new HashSet<>(Arrays.asList(targetJob.getUuid())), 2);
247+
248+
// 20 readJobs + 1 downstreamJob for each (20) + 1 write job = 41
249+
assertThat(connectedJobs).size().isEqualTo(41);
250+
251+
Set<UUID> jobIds = connectedJobs.stream().map(JobData::getUuid).collect(Collectors.toSet());
252+
// expect the job that wrote "commonDataset", which is readJob0's input
253+
assertThat(jobIds).contains(targetJob.getUuid());
254+
255+
// expect all downstream jobs
256+
Set<UUID> readJobUUIDs =
257+
jobRows.stream()
258+
.flatMap(row -> Stream.concat(Stream.of(row), row.getDownstreamJobs().stream()))
259+
.map(JobLineage::getId)
260+
.collect(Collectors.toSet());
261+
assertThat(jobIds).containsAll(readJobUUIDs);
262+
263+
Map<UUID, JobData> actualJobRows =
264+
connectedJobs.stream().collect(Collectors.toMap(JobData::getUuid, Functions.identity()));
265+
for (JobLineage expected : jobRows) {
266+
JobData job = actualJobRows.get(expected.getId());
267+
assertThat(job.getInputUuids())
268+
.containsAll(
269+
expected.getInput().map(ds -> ds.getDatasetRow().getUuid()).stream()::iterator);
270+
assertThat(job.getOutputUuids())
271+
.containsAll(
272+
expected.getOutput().map(ds -> ds.getDatasetRow().getUuid()).stream()::iterator);
273+
}
274+
Set<UUID> lineageForOriginalJob =
275+
lineageDao.getLineage(new HashSet<>(Arrays.asList(writeJob.getJob().getUuid())), 2).stream()
276+
.map(JobData::getUuid)
277+
.collect(Collectors.toSet());
278+
assertThat(lineageForOriginalJob).isEqualTo(jobIds);
279+
}
280+
180281
@Test
181282
public void testGetLineageWithJobThatHasNoDownstreamConsumers() {
182283

api/src/test/java/marquez/db/SearchDaoTest.java

Lines changed: 65 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@
77

88
import static org.assertj.core.api.Assertions.assertThat;
99

10+
import com.google.common.collect.ImmutableMap;
11+
import com.google.common.collect.ImmutableSet;
12+
import java.net.URL;
13+
import java.sql.SQLException;
1014
import java.time.Instant;
1115
import java.util.List;
1216
import java.util.Map;
@@ -15,17 +19,24 @@
1519
import marquez.api.models.SearchFilter;
1620
import marquez.api.models.SearchResult;
1721
import marquez.api.models.SearchSort;
22+
import marquez.common.Utils;
23+
import marquez.common.models.JobType;
24+
import marquez.db.models.JobRow;
25+
import marquez.db.models.NamespaceRow;
1826
import marquez.jdbi.MarquezJdbiExternalPostgresExtension;
27+
import marquez.service.models.JobMeta;
1928
import org.jdbi.v3.core.Jdbi;
2029
import org.junit.jupiter.api.BeforeAll;
2130
import org.junit.jupiter.api.Tag;
2231
import org.junit.jupiter.api.Test;
2332
import org.junit.jupiter.api.extension.ExtendWith;
33+
import org.postgresql.util.PGobject;
2434

2535
/** The test suite for {@link SearchDao}. */
2636
@Tag("DataAccessTests")
2737
@ExtendWith(MarquezJdbiExternalPostgresExtension.class)
2838
public class SearchDaoTest {
39+
2940
static final int LIMIT = 25;
3041
static final int NUM_OF_JOBS = 2;
3142
/**
@@ -34,10 +45,12 @@ public class SearchDaoTest {
3445
*/
3546
static final int NUM_OF_DATASETS = 12;
3647

48+
public static final String NEW_SYMLINK_TARGET_JOB = "a_new_symlink_target_job";
49+
3750
static SearchDao searchDao;
3851

3952
@BeforeAll
40-
public static void setUpOnce(final Jdbi jdbi) {
53+
public static void setUpOnce(final Jdbi jdbi) throws SQLException {
4154
searchDao = jdbi.onDemand(SearchDao.class);
4255

4356
DbTestUtils.newDataset(jdbi, "name_ordering_0");
@@ -48,7 +61,51 @@ public static void setUpOnce(final Jdbi jdbi) {
4861
DbTestUtils.newDataset(jdbi, "time_ordering_1");
4962
DbTestUtils.newDataset(jdbi, "time_ordering_2");
5063

51-
DbTestUtils.newJobs(jdbi, NUM_OF_JOBS);
64+
ImmutableSet<JobRow> jobRows = DbTestUtils.newJobs(jdbi, NUM_OF_JOBS);
65+
66+
// add a symlinked job - validate that the number of results is the same in the below unit test
67+
jobRows.stream()
68+
.findAny()
69+
.ifPresent(
70+
j -> {
71+
try {
72+
NamespaceRow namespaceRow =
73+
jdbi.onDemand(NamespaceDao.class)
74+
.findNamespaceByName(j.getNamespaceName())
75+
.get();
76+
JobRow symlinkTargetJob =
77+
DbTestUtils.newJobWith(
78+
jdbi,
79+
namespaceRow.getName(),
80+
NEW_SYMLINK_TARGET_JOB,
81+
new JobMeta(
82+
JobType.valueOf(j.getType()),
83+
ImmutableSet.copyOf(j.getInputs()),
84+
ImmutableSet.of(),
85+
new URL(j.getLocation()),
86+
ImmutableMap.of(),
87+
j.getDescription().orElse(null),
88+
null));
89+
PGobject inputs = new PGobject();
90+
inputs.setType("json");
91+
inputs.setValue(Utils.getMapper().writeValueAsString(j.getInputs()));
92+
jdbi.onDemand(JobDao.class)
93+
.upsertJob(
94+
j.getUuid(),
95+
JobType.valueOf(j.getType()),
96+
j.getCreatedAt(),
97+
namespaceRow.getUuid(),
98+
namespaceRow.getName(),
99+
j.getName(),
100+
j.getDescription().orElse(null),
101+
j.getJobContextUuid().orElse(null),
102+
j.getLocation(),
103+
symlinkTargetJob.getUuid(),
104+
inputs);
105+
} catch (Exception e) {
106+
throw new RuntimeException(e);
107+
}
108+
});
52109
}
53110

54111
@Test
@@ -72,6 +129,12 @@ public void testSearch() {
72129
final List<SearchResult> resultsWithOnlyJobs =
73130
resultsGroupedByType.get(SearchResult.ResultType.JOB);
74131
assertThat(resultsWithOnlyJobs).hasSize(NUM_OF_JOBS);
132+
133+
// Even though we searched for "test" and the symlink target doesn't have "test" in its name,
134+
// it is part of the search results because the original job had "test" in its name.
135+
assertThat(resultsWithOnlyJobs)
136+
.filteredOn(j -> j.getName().equals(NEW_SYMLINK_TARGET_JOB))
137+
.isNotEmpty();
75138
}
76139

77140
@Test

0 commit comments

Comments
 (0)