Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,37 @@ This map tells the program, which units to expect in the respective columns. It

There is currently **no support** to map scaled units to base units (e.g. 1 kW -> 1000 W).

### TDP value conversion
The TDP values are given in a different format depending on the manufacturer / CPU model.
They have been brought into a standardized format. If a range of values was specified, the first listed (lowest) value was used.
The following table shows examples of the different formats on the left and the standardized version on the right.

<table>
<tr><th>Before:</th><th></th><th>After:</th></tr>
<tr><td>

| tdp (W) |
|----------|
| 15-30 |
| 1.5/20 |
| 3.1--6 |

</td><td></td><td>

| tdp (W) |
|---------|
| 15 |
| 1.5 |
| 3.1 |

</td></tr> </table>

### Default TDP value computation
Default TDP values are computed per intended use of the processors listed in the table.
For this purpose, only processors with a `launch date/last time buy` date not older than `current year -
10 years` are considered.
The year information can be changed in the `computeDefaultTdps` function. The default TDP values are added to the output csv file as bottom rows.

## Contributing
### Linting
The linting rules are specified in `.groovylintrc.json`; please apply them when contributing new code.
Expand Down
49 changes: 29 additions & 20 deletions app/src/main/groovy/org/cpuInfoFetcher/Main.groovy
Original file line number Diff line number Diff line change
@@ -1,21 +1,10 @@
package org.cpuinfofetcher

import org.cpuinfofetcher.utils.Helpers
import org.cpuinfofetcher.utils.UnitsAdapter
import org.dflib.Exp
import org.dflib.Series
import org.dflib.Printers

import static org.dflib.Exp.*
import java.time.LocalDateTime


import java.nio.file.Files
import java.util.logging.Logger

import java.nio.file.Paths

import org.dflib.DataFrame
import org.dflib.Printers
import org.dflib.JoinType
import org.dflib.csv.Csv

Expand Down Expand Up @@ -49,6 +38,9 @@ class Main {
'cores': ['Total Cores', '# of CPU Cores', 'cores'],
'threads': ['cores', 'Total Cores', '# of CPU Cores', 'Total Threads', '# of Threads', 'threads']
]

static Map<String, List<String>> specification_aliases_retain_null_entries = ['name': ['name'], "Launch Year/Last Time Buy": ["Launch Year/Last Time Buy"]]

// Mapping units to columns
static Map<String, List<String>> units_mapping = ['tdp': ['W', 'Watt']]

Expand Down Expand Up @@ -85,10 +77,6 @@ class Main {
return specifications
}

static DataFrame removeDuplicates(DataFrame specifications) {
return specifications.rows().selectUnique('name')
}



static void main(String[] args) {
Expand All @@ -104,26 +92,47 @@ class Main {

// Merging Info into big file
DataFrame specifications = mergeSpecifications(specificationsList)
specifications = removeDuplicates(specifications)

// Remove duplicate rows
specifications = ProcessSpecificationsTable.removeDuplicates(specifications)

// Extract a uniform year for all rows
specifications = ProcessSpecificationsTable.extractUniformYearColumn(specifications)

Csv.save(specifications, Paths.get('..', 'specifications_out', 'specifications.csv'))
this.specifications = specifications
LOGGER.info('Merged all specifications.')

// Selecting relevant information
CPUSpecificationsSummarizer summarizer = new CPUSpecificationsSummarizer()
DataFrame selected_specifications = summarizer.extract_selection(
specifications,

selected_specifications = summarizer.extract_selection(
specifications,
this.specification_aliases,
true
)

// Add "Launch Year/Last Time Buy" column
DataFrame columns_to_add = summarizer.extract_selection(
specifications,
this.specification_aliases_retain_null_entries,
false
)

// Perform Left Join (keeping all rows of selected_specifications)
def selected_specifications = selected_specifications.join(columns_to_add)
.on("name")
.colsExcept(c -> c.endsWith("_"))
.select()

LOGGER.info('Extracted information.')

UnitsAdapter ua = new UnitsAdapter()
selected_specifications = ua.unitToColumnName(selected_specifications, this.units_mapping)
LOGGER.info('Extracted units from data.')

// adjusts format of tdp values to make them uniform
selected_specifications = ua.extractFirstNumber(selected_specifications)
selected_specifications = ProcessSpecificationsTable.extractFirstNumber(selected_specifications)

// add default TDPs
selected_specifications = ProcessSpecificationsTable.computeDefaultTdps(selected_specifications)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,94 @@
package org.cpuinfofetcher
import org.cpuinfofetcher.utils.Helpers
import org.dflib.DataFrame
import org.dflib.Printers
import java.time.LocalDateTime
import static org.dflib.Exp.$col
import java.time.Year

/**
* Adapts columns with units to be more uniform
*/
class UnitsAdapter {

static DataFrame unitToColumnName(DataFrame df, Map<String, String[]> unit_mapping) {
List<String> old_col_names = df.getColumnsIndex().toArray()
// Define new column names with units
List<String> new_col_names = []
for (String col_name : old_col_names) {
List<String> units = unit_mapping.get(col_name)
if (units == null || col_name.endsWith("(${units.get(0)})")) {
new_col_names.add(col_name)
} else {
new_col_names.add("${col_name} (${units.get(0)})")
}
}

// Extract units from values
def new_df = DataFrame.byArrayRow(*new_col_names).appender()
for (int i = 0; i < df.height(); i++) {
List row = []
for (int j = 0; j < df.width(); j++) {
List<String> units = unit_mapping.get(old_col_names.get(j))
String value = df.get(j, i)
if (units != null) {
for (String unit : units) {
value = value.replaceAll("${units.get(0)}", '').replaceAll(' ', '')
}
}
row.add(value)
}

new_df.append(*row)
}
new_df = new_df.toDataFrame()

return new_df
}

}

/**
* Extracts the first numeric value from the 'tdp (W)' column and updates the DataFrame.
*
* Examples of extraction:
* - "15-30" --> 15
* - "1.5/20" --> 1.5
* - "3.1--6" --> 3.1
*
* @param df the input DataFrame
* @return a DataFrame with the updated 'tdp (W)' column containing only the first numeric value
*/
static DataFrame extractFirstNumber(DataFrame df) {
DataFrame old_df = df.cols().selectAs(Map.of("tdp (W)", "tdp old"))
def new_df = DataFrame.empty("tdp (W)")

for (int i = 0; i < old_df.height(); i++) {
String tdp_value = old_df.rows(i).select().get("tdp old", 0)
// Use the matcher to extract the first number
def matcher = tdp_value =~ /^[0-9]*\.?[0-9]+/ // Regex to match the first number (integer or decimal)
def new_tdp_value = matcher.find() ? Double.parseDouble(matcher.group(0)) : null
new_df = new_df.addRow("tdp (W)": new_tdp_value)

}
new_df = old_df.hConcat(new_df).colsExcept('tdp old').select()

return new_df
}


/**
* Removes duplicate rows from the DataFrame based on the 'name' column.
*
* @param specifications the DataFrame to process.
* @return a DataFrame with unique rows by 'name'.
*/

static DataFrame removeDuplicates(DataFrame specifications) {
return specifications.rows().selectUnique('name')
}



/**
* Adds default TDP values to the specifications DataFrame.
Expand All @@ -14,7 +100,14 @@ import static org.dflib.Exp.$col
* @return the updated DataFrame with default TDP values
*/
static DataFrame computeDefaultTdps(DataFrame specifications) {
DataFrame aggregatedDf = specifications.group('intended_usage').agg(
int currentYear = Year.now().getValue()

DataFrame specifications_filtered = specifications.rows({ row ->
def year = row.get("Launch Year/Last Time Buy") as Integer
return year != null && year >= currentYear - 10
}).select()

DataFrame aggregatedDf = specifications_filtered.group('intended_usage').agg(
$col('intended_usage').first().as('intended_usage'),
$col('cores').castAsInt().avg().as("avg_cores"),
$col('threads').castAsInt().avg().as("avg_threads"),
Expand Down Expand Up @@ -55,4 +148,91 @@ static DataFrame computeDefaultTdps(DataFrame specifications) {
}

return specifications
}
}



static DataFrame extractUniformYearColumn(DataFrame df) {
// Helper method to extract year from various formats
// Match and parse specific patterns
def extractYear = { value ->
if (value == null || value.toString().trim().isEmpty()) {
return null // Handle null or empty input
}

value = value.toString().trim() // Ensure the value is a trimmed string

if (value =~ /^\d{4}$/) { // Matches "2023" (4-digit year)
return value
} else if (value =~ /^\d{1,2}\/\d{1,2}\/\d{2,4}$/) { // Matches "3/22/22" or "03/15/2021"
def parts = value.split("/")
return parts[-1].length() == 2 ? "20${parts[-1]}" : parts[-1] // Handle yy or yyyy
} else if (value =~ /^Q[1-4]'\d{2}$/) { // Matches "Q2'22" (quarter-year format with short year)
return "20" + value[-2..-1]
} else if (value =~ /^Q[1-4]\d{4}$/) { // Matches "Q12021" (quarter-year full format)
return value[-4..-1]
} else if (value =~ /^Q[1-4]\s?\d{2,4}$/) { // Matches "Q217" or "Q2 2026"
def extractedYear = value.replaceAll(/Q[1-4]\s?/, "") // Extract the number after "Q"
return extractedYear.length() == 2 ? "20" + extractedYear : extractedYear
} else if (value =~ /^[1-4]Q\s\d{4}$/) { // Matches "3Q 2016"
return value[-4..-1]
} else if (value =~ /^[A-Za-z]+\s\d{4}$/) { // Matches "June 2017" (Month-Year format)
return value.replaceAll(/[^\d]/, "") // Remove non-digit characters, keep year
} else if (value =~ /^\d{1,2}\/\d{4}$/) { // Matches "06/2017" (MM/YYYY format)
return value.split("/")[1] // Extract year from MM/YYYY format
} else if (value =~ /^[1-4]Q\d{2}$/) { // Matches "2Q18" (Quarter-Year with short year)
return "20" + value[-2..-1]
} else if (value =~ /^\d{2}'\d{2}$/) { // Matches "04'16" (Month-Year short year MM'YY)
return "20" + value[-2..-1] // Extract "16" and convert to "2016"
} else if (value =~ /\b\d{4}\b/) { // Matches all 4-digit years in a string
def matcher = (value =~ /\b\d{4}\b/) // Find all 4-digit years
def allYears = matcher.collect { it.toInteger() } // Collect all matched years as integers
return allYears.min() // Return the earliest year (minimum)
} else {
return null // Return null if no patterns match
}
}

def new_df = DataFrame.empty("Launch Year/Last Time Buy")

// Create a new column that contains the uniform year format
df.rows().select().each { row ->
def launchDate = row.get("Launch Date") // Get value for "Launch Date"
def lastTimeBuy = row.get("Last Time Buy") // Get value for "Last Time Buy"

// Check for "Launch Date" or fall back to "Last Time Buy"
Integer year = extractYear(launchDate ?: lastTimeBuy) as Integer
new_df = new_df.addRow("Launch Year/Last Time Buy": year)
}
//new_df = df.hConcat(new_df).colsExcept("Launch Date", "Last Time Buy").select()
new_df = df.hConcat(new_df)

// Iterate through the rows and print results
for (int i = 0; i < new_df.height(); i++) {
def name = new_df.getColumn("name").get(i) // # Processor name
def launchDate = new_df.getColumn("Launch Date").get(i) // Original "Launch Date"
def lastTimeBuy = new_df.getColumn("Last Time Buy").get(i) // Original "Last Time Buy"
def uniformYear = new_df.getColumn("Launch Year/Last Time Buy").get(i) // New "Uniform Launch Year"


if ((!(launchDate == null || launchDate.toString().trim().isEmpty())
|| !(lastTimeBuy == null || lastTimeBuy.toString().trim().isEmpty()))
&& (uniformYear == null || uniformYear.toString().trim().isEmpty())) {
println "Warning: ${name} -> No valid uniform year found! (Launch Date: $launchDate, Last Time Buy: $lastTimeBuy, Uniform Year: $uniformYear)"
}

//println "Processor: $name, Uniform Year: $uniformYear (Launch Date: $launchDate, Last Time Buy: $lastTimeBuy)"
//if (uniformYear != null) {
// println "Row ${i + 1}: Uniform Year is $uniformYear (Launch Date: $launchDate, Last Time Buy: $lastTimeBuy)"
//} else {
// println "Row ${i + 1}: Warning - No valid value found in 'Launch Date' or 'Last Time Buy'! (Launch Date: $launchDate, Last Time Buy: $lastTimeBuy)"
//}
}

//String table = Printers.tabular.toString(new_df.head(10));
//System.out.println(table);
return new_df
}



Loading