diff --git a/.gitignore b/.gitignore index 2c5af7f5f6..e9619ea503 100644 --- a/.gitignore +++ b/.gitignore @@ -44,4 +44,5 @@ plugins-prod /test-sched /test-module /results -/x/* \ No newline at end of file +/x/* +mise.toml \ No newline at end of file diff --git a/plugins/nf-google/src/main/nextflow/cloud/google/batch/GoogleBatchMachineTypeSelector.groovy b/plugins/nf-google/src/main/nextflow/cloud/google/batch/GoogleBatchMachineTypeSelector.groovy index 23044492b6..6bdd8bb6e9 100644 --- a/plugins/nf-google/src/main/nextflow/cloud/google/batch/GoogleBatchMachineTypeSelector.groovy +++ b/plugins/nf-google/src/main/nextflow/cloud/google/batch/GoogleBatchMachineTypeSelector.groovy @@ -82,6 +82,25 @@ class GoogleBatchMachineTypeSelector { */ private static final List ACCELERATOR_OPTIMIZED_FAMILIES = ['a2-*', 'a3-*', 'g2-*'] + /* + * Families that only support Hyperdisk disk types (not pd-standard, pd-balanced, pd-ssd). + * These require 'hyperdisk-*' as boot disk type. + * https://docs.cloud.google.com/compute/docs/general-purpose-machines?hl=en#supported_disk_types_for_c4 + */ + private static final List HYPERDISK_ONLY_FAMILIES = ['c4-*', 'c4a-*', 'c4d-*', 'n4-*', 'n4a-*', 'n4d-*', 'z3-*'] + /* + * Families that do not support Local SSD + */ + private static final List PD_ONLY_FAMILIES = ['e2-*'] + /* + * Families that do not support Local SSD + */ + private static final List NO_LOCAL_SSD_SUPPORT_FAMILIES = ['e2-*', 'h3-*', 'm2-*', 'm4-*', 'n4-*', 't2a-*', 't2d-*', 'x4-*'] + /* + * Families that support local SSD with 'lssd' suffix + */ + private static final List PARTIAL_LOCAL_SSD_SUPPORT_FAMILIES = ['c3-*', 'c3a-*', 'c3d-*', 'c4-*', 'c4a-*', 'c4d-*', 'h4d-*', 'z3-*'] + @Immutable static class MachineType { String type @@ -122,11 +141,13 @@ class GoogleBatchMachineTypeSelector { final matchMachineType = {String type -> !families || families.find { matchType(it, type) }} // find machines with enough resources and SSD local disk - final validMachineTypes = getAvailableMachineTypes(region, spot).findAll { + def validMachineTypes = getAvailableMachineTypes(region, spot).findAll { it.cpusPerVm >= cpus && it.memPerVm >= memoryGB && matchMachineType(it.type) }.collect() + if (fusionEnabled) + validMachineTypes = validMachineTypes.findAll { hasLocalSsd(it.type)}.collect() final sortedByCost = validMachineTypes.sort { (it.cpusPerVm > 2 || it.memPerVm > 2 ? FAMILY_COST_CORRECTION.get(it.family, 1.0) : 1.0) * (spot ? it.spotPrice : it.onDemandPrice) @@ -135,7 +156,7 @@ class GoogleBatchMachineTypeSelector { return sortedByCost.first() } - protected boolean matchType(String family, String vmType) { + protected static boolean matchType(String family, String vmType) { if (!family) return true if (family.contains('*')) @@ -253,10 +274,7 @@ class GoogleBatchMachineTypeSelector { return findFirstValidSize(requested, [8]) } - // These families have a local SSD already attached and is not configurable. - if( ((machineType.family == "c3" || machineType.family == "c3d") && machineType.type.endsWith("-lssd")) || - machineType.family == "a3" || - machineType.type.startsWith("a2-ultragpu-") ) + if( notConfigurableLocalSSD(machineType) ) return new MemoryUnit( 0 ) // For other special families, the user must provide a valid size. If a family does not @@ -264,6 +282,14 @@ class GoogleBatchMachineTypeSelector { return requested } + protected notConfigurableLocalSSD(MachineType machineType) { + // These families have a local SSD already attached and is not configurable. + return ((machineType.family == "c3" || machineType.family == "c3d") && machineType.type.endsWith("-lssd")) || + ((machineType.family == "c4" || machineType.family == "c4a" || machineType.family == "c4d") && machineType.type.endsWith("-lssd")) || + machineType.family == "a3" || + machineType.type.startsWith("a2-ultragpu-") + } + /** * Find first valid disk size given the possible mounted partition * @@ -287,6 +313,53 @@ class GoogleBatchMachineTypeSelector { return new MemoryUnit( numberOfDisks * 375L * (1<<30) ) } + /** + * Check if the machine type belongs to a family that only supports Hyperdisk. + * + * @param machineType Machine type + * @return Boolean value indicating if the machine type requires Hyperdisk. + */ + static boolean isHyperdiskOnly(String machineType) { + return HYPERDISK_ONLY_FAMILIES.any { matchType(it, machineType) } + } + + /** + * Check if the machine type belongs to a family that only supports pd-* disk. + * + * @param machineType Machine type + * @return Boolean value indicating if the machine type requires pd-* disk type. + */ + static boolean isPdOnly(String machineType) { + return PD_ONLY_FAMILIES.any { matchType(it, machineType) } + } + + /** + * Check if the machine type allow to have a local-ssd . + * + * @param machineType Machine type + * @return Boolean value indicating if the machine type can have local ssd disks. + */ + static boolean hasLocalSsd(String machineType) { + if( machineType.contains('lssd') ) + return true + + if( PARTIAL_LOCAL_SSD_SUPPORT_FAMILIES.any { matchType(it, machineType) } ) + return false + + if( NO_LOCAL_SSD_SUPPORT_FAMILIES.any { matchType(it, machineType) } ) + return false + + return true + } + /** + * Check if a machine type doesn't support + * @param machineTypeOrFamily + * @return + */ + static boolean unsupportedLocalSSD(String machineTypeOrFamily) { + return NO_LOCAL_SSD_SUPPORT_FAMILIES.any { matchType(it, machineTypeOrFamily) } + } + /** * Determine whether GPU drivers should be installed. * diff --git a/plugins/nf-google/src/main/nextflow/cloud/google/batch/GoogleBatchTaskHandler.groovy b/plugins/nf-google/src/main/nextflow/cloud/google/batch/GoogleBatchTaskHandler.groovy index c3fd836ce7..792ec949d5 100644 --- a/plugins/nf-google/src/main/nextflow/cloud/google/batch/GoogleBatchTaskHandler.groovy +++ b/plugins/nf-google/src/main/nextflow/cloud/google/batch/GoogleBatchTaskHandler.groovy @@ -401,11 +401,12 @@ class GoogleBatchTaskHandler extends TaskHandler implements FusionAwareTask { else { final instancePolicy = AllocationPolicy.InstancePolicy.newBuilder() - if( batchConfig.getBootDiskImage() ) - instancePolicy.setBootDisk(AllocationPolicy.Disk.newBuilder().setImage(batchConfig.getBootDiskImage())) - if( fusionEnabled() && !disk ) { - disk = new DiskResource(request: '375 GB', type: 'local-ssd') + final reqMachineType = task.config.getMachineType() + disk = new DiskResource( + request: '375 GB', + type: reqMachineType ? chooseFusionDiskType(reqMachineType) : 'local-ssd' + ) log.debug "[GOOGLE BATCH] Process `${task.lazyName()}` - adding local volume as fusion scratch: $disk" } @@ -423,6 +424,20 @@ class GoogleBatchTaskHandler extends TaskHandler implements FusionAwareTask { ) } + // Configure boot disk + final bootDisk = AllocationPolicy.Disk.newBuilder() + boolean setBoot = false + if( batchConfig.getBootDiskImage() ) { + bootDisk.setImage(batchConfig.getBootDiskImage()) + setBoot = true + } + if( machineType && GoogleBatchMachineTypeSelector.INSTANCE.isHyperdiskOnly(machineType.type) ) { + bootDisk.setType('hyperdisk-balanced') + setBoot = true + } + if( setBoot ) + instancePolicy.setBootDisk(bootDisk) + if( task.config.getAccelerator() ) { final accelerator = AllocationPolicy.Accelerator.newBuilder() .setCount(task.config.getAccelerator().getRequest()) @@ -482,6 +497,22 @@ class GoogleBatchTaskHandler extends TaskHandler implements FusionAwareTask { return new InstancePolicyResult(instancePolicyOrTemplate.build(), requiresScratchVolume) } + /** + * Choose the disk type for Fusion according to the machine or family. + * Preference is 'local-ssd', 'hyperdisk-balanced' and 'pd-balanced' other types can be set by setting disk directive + * @param machineTypeOrFamily + * @return Disk type + */ + protected String chooseFusionDiskType(String machineTypeOrFamily){ + if( !GoogleBatchMachineTypeSelector.unsupportedLocalSSD(machineTypeOrFamily) ){ + return 'local-ssd' + } else if( GoogleBatchMachineTypeSelector.isPdOnly(machineTypeOrFamily) ){ + return 'pd-balanced' + } else { + return 'hyperdisk-balanced' + } + } + /** * Build the allocation policy for the job * diff --git a/plugins/nf-google/src/test/nextflow/cloud/google/batch/GoogleBatchMachineTypeSelectorTest.groovy b/plugins/nf-google/src/test/nextflow/cloud/google/batch/GoogleBatchMachineTypeSelectorTest.groovy index 607eb4779a..ba2ac43f4b 100644 --- a/plugins/nf-google/src/test/nextflow/cloud/google/batch/GoogleBatchMachineTypeSelectorTest.groovy +++ b/plugins/nf-google/src/test/nextflow/cloud/google/batch/GoogleBatchMachineTypeSelectorTest.groovy @@ -34,6 +34,12 @@ class GoogleBatchMachineTypeSelectorTest extends Specification { new MachineType(type: 'm2-type08', family: 'm2', 'spotPrice': 0.036, 'onDemandPrice': 0.35, 'cpusPerVm': 8, 'memPerVm': 8), new MachineType(type: 'n2-type09', family: 'n2', 'spotPrice': 0.040, 'onDemandPrice': 0.40, 'cpusPerVm': 10, 'memPerVm': 10), new MachineType(type: 'c2-type10', family: 'c2', 'spotPrice': 0.045, 'onDemandPrice': 0.45, 'cpusPerVm': 10, 'memPerVm': 10), + new MachineType(type: 'c4-type11', family: 'c4', 'spotPrice': 0.040, 'onDemandPrice': 0.40, 'cpusPerVm': 8, 'memPerVm': 8), + new MachineType(type: 'c4a-type12', family: 'c4a', 'spotPrice': 0.038, 'onDemandPrice': 0.38, 'cpusPerVm': 8, 'memPerVm': 8), + new MachineType(type: 'c4d-type13', family: 'c4d', 'spotPrice': 0.039, 'onDemandPrice': 0.39, 'cpusPerVm': 8, 'memPerVm': 8), + new MachineType(type: 'n4-type14', family: 'n4', 'spotPrice': 0.035, 'onDemandPrice': 0.35, 'cpusPerVm': 8, 'memPerVm': 8), + new MachineType(type: 'n4a-type15', family: 'n4a', 'spotPrice': 0.033, 'onDemandPrice': 0.33, 'cpusPerVm': 8, 'memPerVm': 8), + new MachineType(type: 'n4d-type16', family: 'n4d', 'spotPrice': 0.034, 'onDemandPrice': 0.34, 'cpusPerVm': 8, 'memPerVm': 8), ] def 'should select best machine type'() { @@ -57,6 +63,12 @@ class GoogleBatchMachineTypeSelectorTest extends Specification { 8 | 8000 | 'reg' | true | false | null | 'm1-type07' 8 | 8000 | 'reg' | false | false | ['m?-*', 'c2-*'] | 'm2-type08' 8 | 8000 | 'reg' | false | false | ['m1-type07', 'm2-type66'] | 'm1-type07' + 8 | 8000 | 'reg' | true | false | ['c4-*'] | 'c4-type11' + 8 | 8000 | 'reg' | true | false | ['c4a-*'] | 'c4a-type12' + 8 | 8000 | 'reg' | true | false | ['c4d-*'] | 'c4d-type13' + 8 | 8000 | 'reg' | true | false | ['n4-*'] | 'n4-type14' + 8 | 8000 | 'reg' | true | false | ['n4a-*'] | 'n4a-type15' + 8 | 8000 | 'reg' | true | false | ['n4d-*'] | 'n4d-type16' } @@ -113,6 +125,27 @@ class GoogleBatchMachineTypeSelectorTest extends Specification { '200 GB' | 'c2-standard-4' | 'c2' | 4 | '375 GB' '50 GB' | 'c2d-highmem-56' | 'c2d' | 56 | '1500 GB' '750 GB' | 'm3-megamem-64' | 'm3' | 64 | '1500 GB' + '100 GB' | 'c4-standard-8-lssd' | 'c4' | 8 | '0' + '100 GB' | 'c4a-standard-8-lssd' | 'c4a' | 8 | '0' + '100 GB' | 'c4d-standard-8-lssd' | 'c4d' | 8 | '0' + } + + def 'should know when hyperdisk is required'() { + expect: + GoogleBatchMachineTypeSelector.INSTANCE.isHyperdiskOnly(TYPE) == EXPECTED + + where: + TYPE | EXPECTED + 'c4-standard-8' | true + 'c4a-standard-8' | true + 'c4d-standard-8' | true + 'n4-standard-8' | true + 'n4a-standard-8' | true + 'n4d-standard-8' | true + 'n1-standard-8' | false + 'n2-standard-8' | false + 'e2-standard-8' | false + 'c2-standard-8' | false } def 'should know when to install GPU drivers'() { @@ -128,4 +161,34 @@ class GoogleBatchMachineTypeSelectorTest extends Specification { 'a3-highgpu-1g' | 0 | true 'g2-standard-4' | 0 | true } + + def 'should detect non-configurable local SSD'() { + expect: + final machineType = new MachineType(type: TYPE, family: FAMILY) + GoogleBatchMachineTypeSelector.INSTANCE.notConfigurableLocalSSD(machineType) == EXPECTED + + where: + TYPE | FAMILY | EXPECTED + // c3/c3d with -lssd suffix → true + 'c3-standard-8-lssd' | 'c3' | true + 'c3d-standard-8-lssd' | 'c3d' | true + // c4/c4a/c4d with -lssd suffix → true + 'c4-standard-8-lssd' | 'c4' | true + 'c4a-standard-8-lssd' | 'c4a' | true + 'c4d-standard-8-lssd' | 'c4d' | true + // a3 family → always true regardless of type + 'a3-highgpu-8g' | 'a3' | true + 'a3-megagpu-64g' | 'a3' | true + // a2-ultragpu- prefix → true regardless of family + 'a2-ultragpu-1g' | 'a2' | true + 'a2-ultragpu-8g' | 'a2' | true + // c3/c4 without -lssd suffix → false + 'c3-standard-8' | 'c3' | false + 'c4-standard-8' | 'c4' | false + // a2 non-ultragpu → false + 'a2-highgpu-1g' | 'a2' | false + // unrelated families → false + 'n2-standard-4' | 'n2' | false + 'e2-standard-8' | 'e2' | false + } } diff --git a/plugins/nf-google/src/test/nextflow/cloud/google/batch/GoogleBatchTaskHandlerTest.groovy b/plugins/nf-google/src/test/nextflow/cloud/google/batch/GoogleBatchTaskHandlerTest.groovy index d1a8155658..fbe88ab060 100644 --- a/plugins/nf-google/src/test/nextflow/cloud/google/batch/GoogleBatchTaskHandlerTest.groovy +++ b/plugins/nf-google/src/test/nextflow/cloud/google/batch/GoogleBatchTaskHandlerTest.groovy @@ -989,6 +989,84 @@ class GoogleBatchTaskHandlerTest extends Specification { result == null } + def 'should set hyperdisk-balanced boot disk for hyperdisk-only machine families' () { + given: + def WORK_DIR = CloudStorageFileSystem.forBucket('foo').getPath('/scratch') + def CONTAINER_IMAGE = 'debian:latest' + def MACHINE_TYPE = 'c4-standard-8' + def exec = Mock(GoogleBatchExecutor) { + getBatchConfig() >> Mock(BatchConfig) + } + and: + def bean = new TaskBean(workDir: WORK_DIR, inputFiles: [:]) + def task = Mock(TaskRun) { + toTaskBean() >> bean + getHashLog() >> 'abcd1234' + getWorkDir() >> WORK_DIR + getContainer() >> CONTAINER_IMAGE + getConfig() >> Mock(TaskConfig) { + getCpus() >> 8 + getResourceLabels() >> [:] + } + } + and: + def mounts = ['/mnt/disks/foo/scratch:/mnt/disks/foo/scratch:rw'] + def launcher = new GoogleBatchLauncherSpecMock('bash .command.run', mounts, []) + + and: + def handler = Spy(new GoogleBatchTaskHandler(task, exec)) + + when: + def req = handler.newSubmitRequest(task, launcher) + then: + handler.fusionEnabled() >> false + handler.findBestMachineType(_, false) >> new GoogleBatchMachineTypeSelector.MachineType(type: MACHINE_TYPE, family: 'c4', location: 'us-central1', priceModel: PriceModel.standard) + + and: + def instancePolicy = req.getAllocationPolicy().getInstances(0).getPolicy() + instancePolicy.getMachineType() == MACHINE_TYPE + instancePolicy.getBootDisk().getType() == 'hyperdisk-balanced' + } + + def 'should not set hyperdisk boot disk for standard machine families' () { + given: + def WORK_DIR = CloudStorageFileSystem.forBucket('foo').getPath('/scratch') + def CONTAINER_IMAGE = 'debian:latest' + def MACHINE_TYPE = 'n2-standard-8' + def exec = Mock(GoogleBatchExecutor) { + getBatchConfig() >> Mock(BatchConfig) + } + and: + def bean = new TaskBean(workDir: WORK_DIR, inputFiles: [:]) + def task = Mock(TaskRun) { + toTaskBean() >> bean + getHashLog() >> 'abcd1234' + getWorkDir() >> WORK_DIR + getContainer() >> CONTAINER_IMAGE + getConfig() >> Mock(TaskConfig) { + getCpus() >> 8 + getResourceLabels() >> [:] + } + } + and: + def mounts = ['/mnt/disks/foo/scratch:/mnt/disks/foo/scratch:rw'] + def launcher = new GoogleBatchLauncherSpecMock('bash .command.run', mounts, []) + + and: + def handler = Spy(new GoogleBatchTaskHandler(task, exec)) + + when: + def req = handler.newSubmitRequest(task, launcher) + then: + handler.fusionEnabled() >> false + handler.findBestMachineType(_, false) >> new GoogleBatchMachineTypeSelector.MachineType(type: MACHINE_TYPE, family: 'n2', location: 'us-central1', priceModel: PriceModel.standard) + + and: + def instancePolicy = req.getAllocationPolicy().getInstances(0).getPolicy() + instancePolicy.getMachineType() == MACHINE_TYPE + instancePolicy.getBootDisk().getType() == '' + } + // ========================================================================== // Tests for extracted helper methods // ========================================================================== @@ -1250,4 +1328,53 @@ class GoogleBatchTaskHandlerTest extends Specification { result.getContainer().getOptions() == '--privileged' } + def 'should choose fusion disk type'() { + given: + def exec = Mock(GoogleBatchExecutor) { + getBatchConfig() >> Mock(BatchConfig) + } + def task = Mock(TaskRun) { + getHashLog() >> 'abcd1234' + getWorkDir() >> CloudStorageFileSystem.forBucket('foo').getPath('/scratch') + getContainer() >> 'ubuntu:latest' + getConfig() >> Mock(TaskConfig) + } + def handler = new GoogleBatchTaskHandler(task, exec) + + expect: + handler.chooseFusionDiskType(machineType) == expected + + where: + machineType | expected + // exact machine types + 'n2-standard-4' | 'local-ssd' // standard family: supports local SSD + 'c2-standard-8' | 'local-ssd' // compute-optimized: supports local SSD + 'n1-standard-1' | 'local-ssd' // legacy N1: supports local SSD + 'e2-standard-4' | 'pd-balanced' // E2: no local SSD, pd-only + 'e2-micro' | 'pd-balanced' // E2 small: no local SSD, pd-only + 'h3-standard-88' | 'hyperdisk-balanced' // H3: no local SSD, not pd-only + 'n4-standard-4' | 'hyperdisk-balanced' // N4: no local SSD, not pd-only + 't2a-standard-4' | 'hyperdisk-balanced' // T2A: no local SSD, not pd-only + 't2d-standard-4' | 'hyperdisk-balanced' // T2D: no local SSD, not pd-only + 'm2-ultramem-208' | 'hyperdisk-balanced' // M2: no local SSD, not pd-only + 'm4-megamem-416' | 'hyperdisk-balanced' // M4: no local SSD, not pd-only + 'x4-megamem-1440' | 'hyperdisk-balanced' // X4: no local SSD, not pd-only + // glob families with * (family glob passed directly) + 'n2-*' | 'local-ssd' // N2 family glob: supports local SSD + 'c3-*' | 'local-ssd' // C3 family glob: partial lssd support, not in NO_LOCAL_SSD + 'a2-*' | 'local-ssd' // A2 accelerator family glob: supports local SSD + 'e2-*' | 'pd-balanced' // E2 family glob: no local SSD, pd-only + 'h3-*' | 'hyperdisk-balanced' // H3 family glob: no local SSD, not pd-only + 'n4-*' | 'hyperdisk-balanced' // N4 family glob: no local SSD, not pd-only + 't2a-*' | 'hyperdisk-balanced' // T2A family glob: no local SSD, not pd-only + 'm2-*' | 'hyperdisk-balanced' // M2 family glob: no local SSD, not pd-only + // glob patterns with ? in the type suffix (? is matched literally by .* in family regex) + 'n2-standard-?' | 'local-ssd' // n2 prefix still not in NO_LOCAL_SSD + 'e2-standard-?' | 'pd-balanced' // e2 prefix matches e2-* → pd-only + 'h3-standard-??' | 'hyperdisk-balanced' // h3 prefix matches h3-* → no local SSD, not pd-only + // ? replacing the family discriminator digit is treated as a literal character (not a wildcard) + 'e?-standard-4' | 'local-ssd' // 'e?' does not match regex ^e2-.*$, so not classified as e2 + 'h?-standard-88' | 'local-ssd' // 'h?' does not match regex ^h3-.*$, so not classified as h3 + } + }