Skip to content

Commit 27785b1

Browse files
authored
Add support for latest-generation Google Cloud machine families (#6841)
1 parent 903caea commit 27785b1

5 files changed

Lines changed: 306 additions & 11 deletions

File tree

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,4 +44,5 @@ plugins-prod
4444
/test-sched
4545
/test-module
4646
/results
47-
/x/*
47+
/x/*
48+
mise.toml

plugins/nf-google/src/main/nextflow/cloud/google/batch/GoogleBatchMachineTypeSelector.groovy

Lines changed: 79 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,25 @@ class GoogleBatchMachineTypeSelector {
8282
*/
8383
private static final List<String> ACCELERATOR_OPTIMIZED_FAMILIES = ['a2-*', 'a3-*', 'g2-*']
8484

85+
/*
86+
* Families that only support Hyperdisk disk types (not pd-standard, pd-balanced, pd-ssd).
87+
* These require 'hyperdisk-*' as boot disk type.
88+
* https://docs.cloud.google.com/compute/docs/general-purpose-machines?hl=en#supported_disk_types_for_c4
89+
*/
90+
private static final List<String> HYPERDISK_ONLY_FAMILIES = ['c4-*', 'c4a-*', 'c4d-*', 'n4-*', 'n4a-*', 'n4d-*', 'z3-*']
91+
/*
92+
* Families that only support Persistent Disk (pd-*) disk types
93+
*/
94+
private static final List<String> PD_ONLY_FAMILIES = ['e2-*']
95+
/*
96+
* Families that do not support Local SSD
97+
*/
98+
private static final List<String> NO_LOCAL_SSD_SUPPORT_FAMILIES = ['e2-*', 'h3-*', 'm2-*', 'm4-*', 'n4-*', 't2a-*', 't2d-*', 'x4-*']
99+
/*
100+
* Families that support local SSD with 'lssd' suffix
101+
*/
102+
private static final List<String> PARTIAL_LOCAL_SSD_SUPPORT_FAMILIES = ['c3-*', 'c3a-*', 'c3d-*', 'c4-*', 'c4a-*', 'c4d-*', 'h4d-*', 'z3-*']
103+
85104
@Immutable
86105
static class MachineType {
87106
String type
@@ -122,11 +141,13 @@ class GoogleBatchMachineTypeSelector {
122141
final matchMachineType = {String type -> !families || families.find { matchType(it, type) }}
123142

124143
// find machines with enough resources and SSD local disk
125-
final validMachineTypes = getAvailableMachineTypes(region, spot).findAll {
144+
def validMachineTypes = getAvailableMachineTypes(region, spot).findAll {
126145
it.cpusPerVm >= cpus &&
127146
it.memPerVm >= memoryGB &&
128147
matchMachineType(it.type)
129148
}.collect()
149+
if (fusionEnabled)
150+
validMachineTypes = validMachineTypes.findAll { hasLocalSsd(it.type)}.collect()
130151

131152
final sortedByCost = validMachineTypes.sort {
132153
(it.cpusPerVm > 2 || it.memPerVm > 2 ? FAMILY_COST_CORRECTION.get(it.family, 1.0) : 1.0) * (spot ? it.spotPrice : it.onDemandPrice)
@@ -135,7 +156,7 @@ class GoogleBatchMachineTypeSelector {
135156
return sortedByCost.first()
136157
}
137158

138-
protected boolean matchType(String family, String vmType) {
159+
protected static boolean matchType(String family, String vmType) {
139160
if (!family)
140161
return true
141162
if (family.contains('*'))
@@ -253,17 +274,22 @@ class GoogleBatchMachineTypeSelector {
253274
return findFirstValidSize(requested, [8])
254275
}
255276

256-
// These families have a local SSD already attached and is not configurable.
257-
if( ((machineType.family == "c3" || machineType.family == "c3d") && machineType.type.endsWith("-lssd")) ||
258-
machineType.family == "a3" ||
259-
machineType.type.startsWith("a2-ultragpu-") )
277+
if( notConfigurableLocalSSD(machineType) )
260278
return new MemoryUnit( 0 )
261279

262280
// For other special families, the user must provide a valid size. If a family does not
263281
// support local disks, then Google Batch shall return an appropriate error.
264282
return requested
265283
}
266284

285+
protected notConfigurableLocalSSD(MachineType machineType) {
286+
// These families have a local SSD already attached that is not configurable.
287+
return ((machineType.family == "c3" || machineType.family == "c3d") && machineType.type.endsWith("-lssd")) ||
288+
((machineType.family == "c4" || machineType.family == "c4a" || machineType.family == "c4d") && machineType.type.endsWith("-lssd")) ||
289+
machineType.family == "a3" ||
290+
machineType.type.startsWith("a2-ultragpu-")
291+
}
292+
267293
/**
268294
* Find first valid disk size given the possible mounted partition
269295
*
@@ -287,6 +313,53 @@ class GoogleBatchMachineTypeSelector {
287313
return new MemoryUnit( numberOfDisks * 375L * (1<<30) )
288314
}
289315

316+
/**
317+
* Check if the machine type belongs to a family that only supports Hyperdisk.
318+
*
319+
* @param machineType Machine type
320+
* @return Boolean value indicating if the machine type requires Hyperdisk.
321+
*/
322+
static boolean isHyperdiskOnly(String machineType) {
323+
return HYPERDISK_ONLY_FAMILIES.any { matchType(it, machineType) }
324+
}
325+
326+
/**
327+
* Check if the machine type belongs to a family that only supports pd-* disk.
328+
*
329+
* @param machineType Machine type
330+
* @return Boolean value indicating if the machine type requires pd-* disk type.
331+
*/
332+
static boolean isPdOnly(String machineType) {
333+
return PD_ONLY_FAMILIES.any { matchType(it, machineType) }
334+
}
335+
336+
/**
337+
* Check if the machine type supports local SSD disks.
338+
*
339+
* @param machineType Machine type
340+
* @return Boolean value indicating if the machine type can have local ssd disks.
341+
*/
342+
static boolean hasLocalSsd(String machineType) {
343+
if( machineType.contains('lssd') )
344+
return true
345+
346+
if( PARTIAL_LOCAL_SSD_SUPPORT_FAMILIES.any { matchType(it, machineType) } )
347+
return false
348+
349+
if( NO_LOCAL_SSD_SUPPORT_FAMILIES.any { matchType(it, machineType) } )
350+
return false
351+
352+
return true
353+
}
354+
/**
355+
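* Check if a machine type or family does not support local SSD disks.
356+
* @param machineTypeOrFamily Machine type or family to check
357+
* @return Boolean value indicating if the machine type or family does not support local SSD disks.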
358+
*/
359+
static boolean unsupportedLocalSSD(String machineTypeOrFamily) {
360+
return NO_LOCAL_SSD_SUPPORT_FAMILIES.any { matchType(it, machineTypeOrFamily) }
361+
}
362+
290363
/**
291364
* Determine whether GPU drivers should be installed.
292365
*

plugins/nf-google/src/main/nextflow/cloud/google/batch/GoogleBatchTaskHandler.groovy

Lines changed: 35 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -401,11 +401,12 @@ class GoogleBatchTaskHandler extends TaskHandler implements FusionAwareTask {
401401
else {
402402
final instancePolicy = AllocationPolicy.InstancePolicy.newBuilder()
403403

404-
if( batchConfig.getBootDiskImage() )
405-
instancePolicy.setBootDisk(AllocationPolicy.Disk.newBuilder().setImage(batchConfig.getBootDiskImage()))
406-
407404
if( fusionEnabled() && !disk ) {
408-
disk = new DiskResource(request: '375 GB', type: 'local-ssd')
405+
final reqMachineType = task.config.getMachineType()
406+
disk = new DiskResource(
407+
request: '375 GB',
408+
type: reqMachineType ? chooseFusionDiskType(reqMachineType) : 'local-ssd'
409+
)
409410
log.debug "[GOOGLE BATCH] Process `${task.lazyName()}` - adding local volume as fusion scratch: $disk"
410411
}
411412

@@ -423,6 +424,20 @@ class GoogleBatchTaskHandler extends TaskHandler implements FusionAwareTask {
423424
)
424425
}
425426

427+
// Configure boot disk
428+
final bootDisk = AllocationPolicy.Disk.newBuilder()
429+
boolean setBoot = false
430+
if( batchConfig.getBootDiskImage() ) {
431+
bootDisk.setImage(batchConfig.getBootDiskImage())
432+
setBoot = true
433+
}
434+
if( machineType && GoogleBatchMachineTypeSelector.INSTANCE.isHyperdiskOnly(machineType.type) ) {
435+
bootDisk.setType('hyperdisk-balanced')
436+
setBoot = true
437+
}
438+
if( setBoot )
439+
instancePolicy.setBootDisk(bootDisk)
440+
426441
if( task.config.getAccelerator() ) {
427442
final accelerator = AllocationPolicy.Accelerator.newBuilder()
428443
.setCount(task.config.getAccelerator().getRequest())
@@ -482,6 +497,22 @@ class GoogleBatchTaskHandler extends TaskHandler implements FusionAwareTask {
482497
return new InstancePolicyResult(instancePolicyOrTemplate.build(), requiresScratchVolume)
483498
}
484499

500+
/**
501+
* Choose the disk type for Fusion according to the machine or family.
502+
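* The preferred type is 'local-ssd'; when local SSD is not supported, 'pd-balanced' is used for families that only support pd-* disks and 'hyperdisk-balanced' otherwise. Other types can be set with the disk directive.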
503+
* @param machineTypeOrFamily
504+
* @return Disk type
505+
*/
506+
protected String chooseFusionDiskType(String machineTypeOrFamily){
507+
if( !GoogleBatchMachineTypeSelector.unsupportedLocalSSD(machineTypeOrFamily) ){
508+
return 'local-ssd'
509+
} else if( GoogleBatchMachineTypeSelector.isPdOnly(machineTypeOrFamily) ){
510+
return 'pd-balanced'
511+
} else {
512+
return 'hyperdisk-balanced'
513+
}
514+
}
515+
485516
/**
486517
* Build the allocation policy for the job
487518
*

plugins/nf-google/src/test/nextflow/cloud/google/batch/GoogleBatchMachineTypeSelectorTest.groovy

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,12 @@ class GoogleBatchMachineTypeSelectorTest extends Specification {
3434
new MachineType(type: 'm2-type08', family: 'm2', 'spotPrice': 0.036, 'onDemandPrice': 0.35, 'cpusPerVm': 8, 'memPerVm': 8),
3535
new MachineType(type: 'n2-type09', family: 'n2', 'spotPrice': 0.040, 'onDemandPrice': 0.40, 'cpusPerVm': 10, 'memPerVm': 10),
3636
new MachineType(type: 'c2-type10', family: 'c2', 'spotPrice': 0.045, 'onDemandPrice': 0.45, 'cpusPerVm': 10, 'memPerVm': 10),
37+
new MachineType(type: 'c4-type11', family: 'c4', 'spotPrice': 0.040, 'onDemandPrice': 0.40, 'cpusPerVm': 8, 'memPerVm': 8),
38+
new MachineType(type: 'c4a-type12', family: 'c4a', 'spotPrice': 0.038, 'onDemandPrice': 0.38, 'cpusPerVm': 8, 'memPerVm': 8),
39+
new MachineType(type: 'c4d-type13', family: 'c4d', 'spotPrice': 0.039, 'onDemandPrice': 0.39, 'cpusPerVm': 8, 'memPerVm': 8),
40+
new MachineType(type: 'n4-type14', family: 'n4', 'spotPrice': 0.035, 'onDemandPrice': 0.35, 'cpusPerVm': 8, 'memPerVm': 8),
41+
new MachineType(type: 'n4a-type15', family: 'n4a', 'spotPrice': 0.033, 'onDemandPrice': 0.33, 'cpusPerVm': 8, 'memPerVm': 8),
42+
new MachineType(type: 'n4d-type16', family: 'n4d', 'spotPrice': 0.034, 'onDemandPrice': 0.34, 'cpusPerVm': 8, 'memPerVm': 8),
3743
]
3844

3945
def 'should select best machine type'() {
@@ -57,6 +63,12 @@ class GoogleBatchMachineTypeSelectorTest extends Specification {
5763
8 | 8000 | 'reg' | true | false | null | 'm1-type07'
5864
8 | 8000 | 'reg' | false | false | ['m?-*', 'c2-*'] | 'm2-type08'
5965
8 | 8000 | 'reg' | false | false | ['m1-type07', 'm2-type66'] | 'm1-type07'
66+
8 | 8000 | 'reg' | true | false | ['c4-*'] | 'c4-type11'
67+
8 | 8000 | 'reg' | true | false | ['c4a-*'] | 'c4a-type12'
68+
8 | 8000 | 'reg' | true | false | ['c4d-*'] | 'c4d-type13'
69+
8 | 8000 | 'reg' | true | false | ['n4-*'] | 'n4-type14'
70+
8 | 8000 | 'reg' | true | false | ['n4a-*'] | 'n4a-type15'
71+
8 | 8000 | 'reg' | true | false | ['n4d-*'] | 'n4d-type16'
6072

6173

6274
}
@@ -113,6 +125,27 @@ class GoogleBatchMachineTypeSelectorTest extends Specification {
113125
'200 GB' | 'c2-standard-4' | 'c2' | 4 | '375 GB'
114126
'50 GB' | 'c2d-highmem-56' | 'c2d' | 56 | '1500 GB'
115127
'750 GB' | 'm3-megamem-64' | 'm3' | 64 | '1500 GB'
128+
'100 GB' | 'c4-standard-8-lssd' | 'c4' | 8 | '0'
129+
'100 GB' | 'c4a-standard-8-lssd' | 'c4a' | 8 | '0'
130+
'100 GB' | 'c4d-standard-8-lssd' | 'c4d' | 8 | '0'
131+
}
132+
133+
def 'should know when hyperdisk is required'() {
134+
expect:
135+
GoogleBatchMachineTypeSelector.INSTANCE.isHyperdiskOnly(TYPE) == EXPECTED
136+
137+
where:
138+
TYPE | EXPECTED
139+
'c4-standard-8' | true
140+
'c4a-standard-8' | true
141+
'c4d-standard-8' | true
142+
'n4-standard-8' | true
143+
'n4a-standard-8' | true
144+
'n4d-standard-8' | true
145+
'n1-standard-8' | false
146+
'n2-standard-8' | false
147+
'e2-standard-8' | false
148+
'c2-standard-8' | false
116149
}
117150

118151
def 'should know when to install GPU drivers'() {
@@ -128,4 +161,34 @@ class GoogleBatchMachineTypeSelectorTest extends Specification {
128161
'a3-highgpu-1g' | 0 | true
129162
'g2-standard-4' | 0 | true
130163
}
164+
165+
def 'should detect non-configurable local SSD'() {
166+
expect:
167+
final machineType = new MachineType(type: TYPE, family: FAMILY)
168+
GoogleBatchMachineTypeSelector.INSTANCE.notConfigurableLocalSSD(machineType) == EXPECTED
169+
170+
where:
171+
TYPE | FAMILY | EXPECTED
172+
// c3/c3d with -lssd suffix → true
173+
'c3-standard-8-lssd' | 'c3' | true
174+
'c3d-standard-8-lssd' | 'c3d' | true
175+
// c4/c4a/c4d with -lssd suffix → true
176+
'c4-standard-8-lssd' | 'c4' | true
177+
'c4a-standard-8-lssd' | 'c4a' | true
178+
'c4d-standard-8-lssd' | 'c4d' | true
179+
// a3 family → always true regardless of type
180+
'a3-highgpu-8g' | 'a3' | true
181+
'a3-megagpu-64g' | 'a3' | true
182+
// a2-ultragpu- prefix → true regardless of family
183+
'a2-ultragpu-1g' | 'a2' | true
184+
'a2-ultragpu-8g' | 'a2' | true
185+
// c3/c4 without -lssd suffix → false
186+
'c3-standard-8' | 'c3' | false
187+
'c4-standard-8' | 'c4' | false
188+
// a2 non-ultragpu → false
189+
'a2-highgpu-1g' | 'a2' | false
190+
// unrelated families → false
191+
'n2-standard-4' | 'n2' | false
192+
'e2-standard-8' | 'e2' | false
193+
}
131194
}

0 commit comments

Comments
 (0)