Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -44,4 +44,5 @@ plugins-prod
/test-sched
/test-module
/results
/x/*
/x/*
mise.toml
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,25 @@ class GoogleBatchMachineTypeSelector {
*/
private static final List<String> ACCELERATOR_OPTIMIZED_FAMILIES = ['a2-*', 'a3-*', 'g2-*']

/*
* Families that only support Hyperdisk disk types (not pd-standard, pd-balanced, pd-ssd).
* These require 'hyperdisk-*' as boot disk type.
* https://docs.cloud.google.com/compute/docs/general-purpose-machines?hl=en#supported_disk_types_for_c4
*/
private static final List<String> HYPERDISK_ONLY_FAMILIES = ['c4-*', 'c4a-*', 'c4d-*', 'n4-*', 'n4a-*', 'n4d-*', 'z3-*']
/*
* Families that do not support Local SSD
*/
private static final List<String> PD_ONLY_FAMILIES = ['e2-*']
/*
* Families that do not support Local SSD
*/
private static final List<String> NO_LOCAL_SSD_SUPPORT_FAMILIES = ['e2-*', 'h3-*', 'm2-*', 'm4-*', 'n4-*', 't2a-*', 't2d-*', 'x4-*']
/*
* Families that support local SSD with 'lssd' suffix
*/
private static final List<String> PARTIAL_LOCAL_SSD_SUPPORT_FAMILIES = ['c3-*', 'c3a-*', 'c3d-*', 'c4-*', 'c4a-*', 'c4d-*', 'h4d-*', 'z3-*']

@Immutable
static class MachineType {
String type
Expand Down Expand Up @@ -122,11 +141,13 @@ class GoogleBatchMachineTypeSelector {
final matchMachineType = {String type -> !families || families.find { matchType(it, type) }}

// find machines with enough resources and SSD local disk
final validMachineTypes = getAvailableMachineTypes(region, spot).findAll {
def validMachineTypes = getAvailableMachineTypes(region, spot).findAll {
it.cpusPerVm >= cpus &&
it.memPerVm >= memoryGB &&
matchMachineType(it.type)
}.collect()
if (fusionEnabled)
validMachineTypes = validMachineTypes.findAll { hasLocalSsd(it.type)}.collect()

final sortedByCost = validMachineTypes.sort {
(it.cpusPerVm > 2 || it.memPerVm > 2 ? FAMILY_COST_CORRECTION.get(it.family, 1.0) : 1.0) * (spot ? it.spotPrice : it.onDemandPrice)
Expand All @@ -135,7 +156,7 @@ class GoogleBatchMachineTypeSelector {
return sortedByCost.first()
}

protected boolean matchType(String family, String vmType) {
protected static boolean matchType(String family, String vmType) {
if (!family)
return true
if (family.contains('*'))
Expand Down Expand Up @@ -255,6 +276,7 @@ class GoogleBatchMachineTypeSelector {

// These families have a local SSD already attached and is not configurable.
if( ((machineType.family == "c3" || machineType.family == "c3d") && machineType.type.endsWith("-lssd")) ||
((machineType.family == "c4" || machineType.family == "c4a" || machineType.family == "c4d") && machineType.type.endsWith("-lssd")) ||
Comment thread
jorgee marked this conversation as resolved.
Outdated
machineType.family == "a3" ||
machineType.type.startsWith("a2-ultragpu-") )
return new MemoryUnit( 0 )
Expand Down Expand Up @@ -287,6 +309,50 @@ class GoogleBatchMachineTypeSelector {
return new MemoryUnit( numberOfDisks * 375L * (1<<30) )
}

/**
* Check if the machine type belongs to a family that only supports Hyperdisk.
*
* @param machineType Machine type
* @return Boolean value indicating if the machine type requires Hyperdisk.
*/
static boolean isHyperdiskOnly(String machineType) {
return HYPERDISK_ONLY_FAMILIES.any { matchType(it, machineType) }
}

/**
* Check if the machine type belongs to a family that only supports pd-* disk.
*
* @param machineType Machine type
* @return Boolean value indicating if the machine type requires pd-* disk type.
*/
static boolean isPdOnly(String machineType) {
return PD_ONLY_FAMILIES.any { matchType(it, machineType) }
}

/**
* Check if the machine type allow to have a local-ssd .
*
* @param machineType Machine type
* @return Boolean value indicating if the machine type can have local ssd disks.
*/
static boolean hasLocalSsd(String machineType) {
if( machineType.contains('lssd') )
return true

if( (PARTIAL_LOCAL_SSD_SUPPORT_FAMILIES + NO_LOCAL_SSD_SUPPORT_FAMILIES).any { matchType(it, machineType) } )
return false
Comment thread
bentsherman marked this conversation as resolved.
Outdated

return true
}
/**
* Check if a machine type doesn't support
* @param machineTypeOrFamily
* @return
*/
static boolean unsupportedLocalSSD(String machineTypeOrFamily) {
return NO_LOCAL_SSD_SUPPORT_FAMILIES.any { matchType(it, machineTypeOrFamily) }
}

/**
* Determine whether GPU drivers should be installed.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -401,11 +401,13 @@ class GoogleBatchTaskHandler extends TaskHandler implements FusionAwareTask {
else {
final instancePolicy = AllocationPolicy.InstancePolicy.newBuilder()

if( batchConfig.getBootDiskImage() )
instancePolicy.setBootDisk(AllocationPolicy.Disk.newBuilder().setImage(batchConfig.getBootDiskImage()))

if( fusionEnabled() && !disk ) {
disk = new DiskResource(request: '375 GB', type: 'local-ssd')
final reqMachineType = task.config.getMachineType()
if ( reqMachineType ) {
disk = new DiskResource(request: '375 GB', type: chooseFusionDiskType(reqMachineType) )
} else {
disk = new DiskResource(request: '375 GB', type: 'local-ssd')
}
Comment thread
jorgee marked this conversation as resolved.
Outdated
log.debug "[GOOGLE BATCH] Process `${task.lazyName()}` - adding local volume as fusion scratch: $disk"
}

Expand All @@ -423,6 +425,20 @@ class GoogleBatchTaskHandler extends TaskHandler implements FusionAwareTask {
)
}

// Configure boot disk
Comment thread
jorgee marked this conversation as resolved.
final bootDisk = AllocationPolicy.Disk.newBuilder()
boolean setBoot = false
if( batchConfig.getBootDiskImage() ) {
bootDisk.setImage(batchConfig.getBootDiskImage())
setBoot = true
}
if( machineType && GoogleBatchMachineTypeSelector.INSTANCE.isHyperdiskOnly(machineType.type) ) {
bootDisk.setType('hyperdisk-balanced')
setBoot = true
}
if( setBoot )
instancePolicy.setBootDisk(bootDisk)

if( task.config.getAccelerator() ) {
final accelerator = AllocationPolicy.Accelerator.newBuilder()
.setCount(task.config.getAccelerator().getRequest())
Expand Down Expand Up @@ -482,6 +498,22 @@ class GoogleBatchTaskHandler extends TaskHandler implements FusionAwareTask {
return new InstancePolicyResult(instancePolicyOrTemplate.build(), requiresScratchVolume)
}

/**
* Choose the disk type for Fusion according to the machine or family.
* Preference is 'local-ssd', 'hyperdisk-balanced' and 'pd-balanced' other types can be set by setting disk directive
* @param machineTypeOrFamily
* @return Disk type
*/
protected String chooseFusionDiskType(String machineTypeOrFamily){
if( !GoogleBatchMachineTypeSelector.unsupportedLocalSSD(machineTypeOrFamily) ){
return 'local-ssd'
} else if( GoogleBatchMachineTypeSelector.isPdOnly(machineTypeOrFamily) ){
return 'pd-balanced'
} else {
return 'hyperdisk-balanced'
}
}

/**
* Build the allocation policy for the job
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,12 @@ class GoogleBatchMachineTypeSelectorTest extends Specification {
new MachineType(type: 'm2-type08', family: 'm2', 'spotPrice': 0.036, 'onDemandPrice': 0.35, 'cpusPerVm': 8, 'memPerVm': 8),
new MachineType(type: 'n2-type09', family: 'n2', 'spotPrice': 0.040, 'onDemandPrice': 0.40, 'cpusPerVm': 10, 'memPerVm': 10),
new MachineType(type: 'c2-type10', family: 'c2', 'spotPrice': 0.045, 'onDemandPrice': 0.45, 'cpusPerVm': 10, 'memPerVm': 10),
new MachineType(type: 'c4-type11', family: 'c4', 'spotPrice': 0.040, 'onDemandPrice': 0.40, 'cpusPerVm': 8, 'memPerVm': 8),
new MachineType(type: 'c4a-type12', family: 'c4a', 'spotPrice': 0.038, 'onDemandPrice': 0.38, 'cpusPerVm': 8, 'memPerVm': 8),
new MachineType(type: 'c4d-type13', family: 'c4d', 'spotPrice': 0.039, 'onDemandPrice': 0.39, 'cpusPerVm': 8, 'memPerVm': 8),
new MachineType(type: 'n4-type14', family: 'n4', 'spotPrice': 0.035, 'onDemandPrice': 0.35, 'cpusPerVm': 8, 'memPerVm': 8),
new MachineType(type: 'n4a-type15', family: 'n4a', 'spotPrice': 0.033, 'onDemandPrice': 0.33, 'cpusPerVm': 8, 'memPerVm': 8),
new MachineType(type: 'n4d-type16', family: 'n4d', 'spotPrice': 0.034, 'onDemandPrice': 0.34, 'cpusPerVm': 8, 'memPerVm': 8),
]

def 'should select best machine type'() {
Expand All @@ -57,6 +63,12 @@ class GoogleBatchMachineTypeSelectorTest extends Specification {
8 | 8000 | 'reg' | true | false | null | 'm1-type07'
8 | 8000 | 'reg' | false | false | ['m?-*', 'c2-*'] | 'm2-type08'
8 | 8000 | 'reg' | false | false | ['m1-type07', 'm2-type66'] | 'm1-type07'
8 | 8000 | 'reg' | true | false | ['c4-*'] | 'c4-type11'
8 | 8000 | 'reg' | true | false | ['c4a-*'] | 'c4a-type12'
8 | 8000 | 'reg' | true | false | ['c4d-*'] | 'c4d-type13'
8 | 8000 | 'reg' | true | false | ['n4-*'] | 'n4-type14'
8 | 8000 | 'reg' | true | false | ['n4a-*'] | 'n4a-type15'
8 | 8000 | 'reg' | true | false | ['n4d-*'] | 'n4d-type16'


}
Expand Down Expand Up @@ -113,6 +125,27 @@ class GoogleBatchMachineTypeSelectorTest extends Specification {
'200 GB' | 'c2-standard-4' | 'c2' | 4 | '375 GB'
'50 GB' | 'c2d-highmem-56' | 'c2d' | 56 | '1500 GB'
'750 GB' | 'm3-megamem-64' | 'm3' | 64 | '1500 GB'
'100 GB' | 'c4-standard-8-lssd' | 'c4' | 8 | '0'
'100 GB' | 'c4a-standard-8-lssd' | 'c4a' | 8 | '0'
'100 GB' | 'c4d-standard-8-lssd' | 'c4d' | 8 | '0'
}

def 'should know when hyperdisk is required'() {
expect:
GoogleBatchMachineTypeSelector.INSTANCE.isHyperdiskOnly(TYPE) == EXPECTED

where:
TYPE | EXPECTED
'c4-standard-8' | true
'c4a-standard-8' | true
'c4d-standard-8' | true
'n4-standard-8' | true
'n4a-standard-8' | true
'n4d-standard-8' | true
'n1-standard-8' | false
'n2-standard-8' | false
'e2-standard-8' | false
'c2-standard-8' | false
}

def 'should know when to install GPU drivers'() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -987,6 +987,84 @@ class GoogleBatchTaskHandlerTest extends Specification {
result == null
}

def 'should set hyperdisk-balanced boot disk for hyperdisk-only machine families' () {
given:
def WORK_DIR = CloudStorageFileSystem.forBucket('foo').getPath('/scratch')
def CONTAINER_IMAGE = 'debian:latest'
def MACHINE_TYPE = 'c4-standard-8'
def exec = Mock(GoogleBatchExecutor) {
getBatchConfig() >> Mock(BatchConfig)
}
and:
def bean = new TaskBean(workDir: WORK_DIR, inputFiles: [:])
def task = Mock(TaskRun) {
toTaskBean() >> bean
getHashLog() >> 'abcd1234'
getWorkDir() >> WORK_DIR
getContainer() >> CONTAINER_IMAGE
getConfig() >> Mock(TaskConfig) {
getCpus() >> 8
getResourceLabels() >> [:]
}
}
and:
def mounts = ['/mnt/disks/foo/scratch:/mnt/disks/foo/scratch:rw']
def launcher = new GoogleBatchLauncherSpecMock('bash .command.run', mounts, [])

and:
def handler = Spy(new GoogleBatchTaskHandler(task, exec))

when:
def req = handler.newSubmitRequest(task, launcher)
then:
handler.fusionEnabled() >> false
handler.findBestMachineType(_, false) >> new GoogleBatchMachineTypeSelector.MachineType(type: MACHINE_TYPE, family: 'c4', location: 'us-central1', priceModel: PriceModel.standard)

and:
def instancePolicy = req.getAllocationPolicy().getInstances(0).getPolicy()
instancePolicy.getMachineType() == MACHINE_TYPE
instancePolicy.getBootDisk().getType() == 'hyperdisk-balanced'
}

def 'should not set hyperdisk boot disk for standard machine families' () {
given:
def WORK_DIR = CloudStorageFileSystem.forBucket('foo').getPath('/scratch')
def CONTAINER_IMAGE = 'debian:latest'
def MACHINE_TYPE = 'n2-standard-8'
def exec = Mock(GoogleBatchExecutor) {
getBatchConfig() >> Mock(BatchConfig)
}
and:
def bean = new TaskBean(workDir: WORK_DIR, inputFiles: [:])
def task = Mock(TaskRun) {
toTaskBean() >> bean
getHashLog() >> 'abcd1234'
getWorkDir() >> WORK_DIR
getContainer() >> CONTAINER_IMAGE
getConfig() >> Mock(TaskConfig) {
getCpus() >> 8
getResourceLabels() >> [:]
}
}
and:
def mounts = ['/mnt/disks/foo/scratch:/mnt/disks/foo/scratch:rw']
def launcher = new GoogleBatchLauncherSpecMock('bash .command.run', mounts, [])

and:
def handler = Spy(new GoogleBatchTaskHandler(task, exec))

when:
def req = handler.newSubmitRequest(task, launcher)
then:
handler.fusionEnabled() >> false
handler.findBestMachineType(_, false) >> new GoogleBatchMachineTypeSelector.MachineType(type: MACHINE_TYPE, family: 'n2', location: 'us-central1', priceModel: PriceModel.standard)

and:
def instancePolicy = req.getAllocationPolicy().getInstances(0).getPolicy()
instancePolicy.getMachineType() == MACHINE_TYPE
instancePolicy.getBootDisk().getType() == ''
}

// ==========================================================================
// Tests for extracted helper methods
// ==========================================================================
Expand Down Expand Up @@ -1248,4 +1326,53 @@ class GoogleBatchTaskHandlerTest extends Specification {
result.getContainer().getOptions() == '--privileged'
}

def 'should choose fusion disk type'() {
given:
def exec = Mock(GoogleBatchExecutor) {
getBatchConfig() >> Mock(BatchConfig)
}
def task = Mock(TaskRun) {
getHashLog() >> 'abcd1234'
getWorkDir() >> CloudStorageFileSystem.forBucket('foo').getPath('/scratch')
getContainer() >> 'ubuntu:latest'
getConfig() >> Mock(TaskConfig)
}
def handler = new GoogleBatchTaskHandler(task, exec)

expect:
handler.chooseFusionDiskType(machineType) == expected

where:
machineType | expected
// exact machine types
'n2-standard-4' | 'local-ssd' // standard family: supports local SSD
'c2-standard-8' | 'local-ssd' // compute-optimized: supports local SSD
'n1-standard-1' | 'local-ssd' // legacy N1: supports local SSD
'e2-standard-4' | 'pd-balanced' // E2: no local SSD, pd-only
'e2-micro' | 'pd-balanced' // E2 small: no local SSD, pd-only
'h3-standard-88' | 'hyperdisk-balanced' // H3: no local SSD, not pd-only
'n4-standard-4' | 'hyperdisk-balanced' // N4: no local SSD, not pd-only
't2a-standard-4' | 'hyperdisk-balanced' // T2A: no local SSD, not pd-only
't2d-standard-4' | 'hyperdisk-balanced' // T2D: no local SSD, not pd-only
'm2-ultramem-208' | 'hyperdisk-balanced' // M2: no local SSD, not pd-only
'm4-megamem-416' | 'hyperdisk-balanced' // M4: no local SSD, not pd-only
'x4-megamem-1440' | 'hyperdisk-balanced' // X4: no local SSD, not pd-only
// glob families with * (family glob passed directly)
'n2-*' | 'local-ssd' // N2 family glob: supports local SSD
'c3-*' | 'local-ssd' // C3 family glob: partial lssd support, not in NO_LOCAL_SSD
'a2-*' | 'local-ssd' // A2 accelerator family glob: supports local SSD
'e2-*' | 'pd-balanced' // E2 family glob: no local SSD, pd-only
'h3-*' | 'hyperdisk-balanced' // H3 family glob: no local SSD, not pd-only
'n4-*' | 'hyperdisk-balanced' // N4 family glob: no local SSD, not pd-only
't2a-*' | 'hyperdisk-balanced' // T2A family glob: no local SSD, not pd-only
'm2-*' | 'hyperdisk-balanced' // M2 family glob: no local SSD, not pd-only
// glob patterns with ? in the type suffix (? is matched literally by .* in family regex)
'n2-standard-?' | 'local-ssd' // n2 prefix still not in NO_LOCAL_SSD
'e2-standard-?' | 'pd-balanced' // e2 prefix matches e2-* → pd-only
'h3-standard-??' | 'hyperdisk-balanced' // h3 prefix matches h3-* → no local SSD, not pd-only
// ? replacing the family discriminator digit is treated as a literal character (not a wildcard)
'e?-standard-4' | 'local-ssd' // 'e?' does not match regex ^e2-.*$, so not classified as e2
'h?-standard-88' | 'local-ssd' // 'h?' does not match regex ^h3-.*$, so not classified as h3
}

}
Loading