Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,37 @@ This map tells the program, which units to expect in the respective columns. It

There is currently **no support** to map scaled units to base units (e.g. 1 kW -> 1000 W).

### TDP value conversion
The TDP values are given in a different format depending on the manufacturer / CPU model.
They have been brought into a standardized format. If a range of values was specified, the first listed (lowest) value was used.
The following table shows examples of the different formats on the left and the standardized version on the right.

<table>
<tr><th>Before:</th><th></th><th>After:</th></tr>
<tr><td>

| tdp (W) |
|----------|
| 15-30 |
| 1.5/20 |
| 3.1--6 |

</td><td></td><td>

| tdp (W) |
|---------|
| 15 |
| 1.5 |
| 3.1 |

</td></tr> </table>

### Default TDP value computation
Default TDP values are computed per intended use of the processors listed in the table.
For this purpose, only processors with a `launch date/last time buy` date not older than `current year -
10 years` are considered.
The year information can be changed in the `computeDefaultTdps` function. The default TDP values are added to the output csv file as bottom rows.

## Contributing
### Linting
The linting rules are specified in `.groovylintrc.json`; please apply them when contributing new code.
Expand Down
49 changes: 29 additions & 20 deletions app/src/main/groovy/org/cpuInfoFetcher/Main.groovy
Original file line number Diff line number Diff line change
@@ -1,21 +1,10 @@
package org.cpuinfofetcher

import org.cpuinfofetcher.utils.Helpers
import org.cpuinfofetcher.utils.UnitsAdapter
import org.dflib.Exp
import org.dflib.Series
import org.dflib.Printers

import static org.dflib.Exp.*
import java.time.LocalDateTime


import java.nio.file.Files
import java.util.logging.Logger

import java.nio.file.Paths

import org.dflib.DataFrame
import org.dflib.Printers
import org.dflib.JoinType
import org.dflib.csv.Csv

Expand Down Expand Up @@ -49,6 +38,9 @@ class Main {
'cores': ['Total Cores', '# of CPU Cores', 'cores'],
'threads': ['cores', 'Total Cores', '# of CPU Cores', 'Total Threads', '# of Threads', 'threads']
]

static Map<String, List<String>> specification_aliases_retain_null_entries = ['name': ['name'], "Launch Year/Last Time Buy": ["Launch Year/Last Time Buy"]]

// Mapping units to columns
static Map<String, List<String>> units_mapping = ['tdp': ['W', 'Watt']]

Expand Down Expand Up @@ -85,10 +77,6 @@ class Main {
return specifications
}

static DataFrame removeDuplicates(DataFrame specifications) {
return specifications.rows().selectUnique('name')
}



static void main(String[] args) {
Expand All @@ -104,26 +92,47 @@ class Main {

// Merging Info into big file
DataFrame specifications = mergeSpecifications(specificationsList)
specifications = removeDuplicates(specifications)

// Remove duplicate rows
specifications = ProcessSpecificationsTable.removeDuplicates(specifications)

// Extract a uniform year for all rows
specifications = ProcessSpecificationsTable.extractUniformYearColumn(specifications)

Csv.save(specifications, Paths.get('..', 'specifications_out', 'specifications.csv'))
this.specifications = specifications
LOGGER.info('Merged all specifications.')

// Selecting relevant information
CPUSpecificationsSummarizer summarizer = new CPUSpecificationsSummarizer()
DataFrame selected_specifications = summarizer.extract_selection(
specifications,

selected_specifications = summarizer.extract_selection(
specifications,
this.specification_aliases,
true
)

// Add "Launch Year/Last Time Buy" column
DataFrame columns_to_add = summarizer.extract_selection(
specifications,
this.specification_aliases_retain_null_entries,
false
)

// Perform Left Join (keeping all rows of selected_specifications)
def selected_specifications = selected_specifications.join(columns_to_add)
.on("name")
.colsExcept(c -> c.endsWith("_"))
.select()

LOGGER.info('Extracted information.')

UnitsAdapter ua = new UnitsAdapter()
selected_specifications = ua.unitToColumnName(selected_specifications, this.units_mapping)
LOGGER.info('Extracted units from data.')

// adjusts format of tdp values to make them uniform
selected_specifications = ua.extractFirstNumber(selected_specifications)
selected_specifications = ProcessSpecificationsTable.extractFirstNumber(selected_specifications)

// add default TDPs
selected_specifications = ProcessSpecificationsTable.computeDefaultTdps(selected_specifications)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,94 @@
package org.cpuinfofetcher
import org.cpuinfofetcher.utils.Helpers
import org.dflib.DataFrame
import org.dflib.Printers
import java.time.LocalDateTime
import static org.dflib.Exp.$col
import java.time.Year

/**
* Adapts columns with units to be more uniform
*/
class UnitsAdapter {

static DataFrame unitToColumnName(DataFrame df, Map<String, String[]> unit_mapping) {
List<String> old_col_names = df.getColumnsIndex().toArray()
// Define new column names with units
List<String> new_col_names = []
for (String col_name : old_col_names) {
List<String> units = unit_mapping.get(col_name)
if (units == null || col_name.endsWith("(${units.get(0)})")) {
new_col_names.add(col_name)
} else {
new_col_names.add("${col_name} (${units.get(0)})")
}
}

// Extract units from values
def new_df = DataFrame.byArrayRow(*new_col_names).appender()
for (int i = 0; i < df.height(); i++) {
List row = []
for (int j = 0; j < df.width(); j++) {
List<String> units = unit_mapping.get(old_col_names.get(j))
String value = df.get(j, i)
if (units != null) {
for (String unit : units) {
value = value.replaceAll("${units.get(0)}", '').replaceAll(' ', '')
}
}
row.add(value)
}

new_df.append(*row)
}
new_df = new_df.toDataFrame()

return new_df
}

}

/**
* Extracts the first numeric value from the 'tdp (W)' column and updates the DataFrame.
*
* Examples of extraction:
* - "15-30" --> 15
* - "1.5/20" --> 1.5
* - "3.1--6" --> 3.1
*
* @param df the input DataFrame
* @return a DataFrame with the updated 'tdp (W)' column containing only the first numeric value
*/
static DataFrame extractFirstNumber(DataFrame df) {
DataFrame old_df = df.cols().selectAs(Map.of("tdp (W)", "tdp old"))
def new_df = DataFrame.empty("tdp (W)")

for (int i = 0; i < old_df.height(); i++) {
String tdp_value = old_df.rows(i).select().get("tdp old", 0)
// Use the matcher to extract the first number
def matcher = tdp_value =~ /^[0-9]*\.?[0-9]+/ // Regex to match the first number (integer or decimal)
def new_tdp_value = matcher.find() ? Double.parseDouble(matcher.group(0)) : null
new_df = new_df.addRow("tdp (W)": new_tdp_value)

}
new_df = old_df.hConcat(new_df).colsExcept('tdp old').select()

return new_df
}


/**
* Removes duplicate rows from the DataFrame based on the 'name' column.
*
* @param specifications the DataFrame to process.
* @return a DataFrame with unique rows by 'name'.
*/

static DataFrame removeDuplicates(DataFrame specifications) {
return specifications.rows().selectUnique('name')
}



/**
* Adds default TDP values to the specifications DataFrame.
Expand All @@ -14,7 +100,14 @@ import static org.dflib.Exp.$col
* @return the updated DataFrame with default TDP values
*/
static DataFrame computeDefaultTdps(DataFrame specifications) {
DataFrame aggregatedDf = specifications.group('intended_usage').agg(
int currentYear = Year.now().getValue()

DataFrame specifications_filtered = specifications.rows({ row ->
def year = row.get("Launch Year/Last Time Buy") as Integer
return year != null && year >= currentYear - 10
}).select()

DataFrame aggregatedDf = specifications_filtered.group('intended_usage').agg(
$col('intended_usage').first().as('intended_usage'),
$col('cores').castAsInt().avg().as("avg_cores"),
$col('threads').castAsInt().avg().as("avg_threads"),
Expand Down Expand Up @@ -55,4 +148,91 @@ static DataFrame computeDefaultTdps(DataFrame specifications) {
}

return specifications
}
}



static DataFrame extractUniformYearColumn(DataFrame df) {
// Helper method to extract year from various formats
// Match and parse specific patterns
def extractYear = { value ->
if (value == null || value.toString().trim().isEmpty()) {
return null // Handle null or empty input
}

value = value.toString().trim() // Ensure the value is a trimmed string

if (value =~ /^\d{4}$/) { // Matches "2023" (4-digit year)
return value
} else if (value =~ /^\d{1,2}\/\d{1,2}\/\d{2,4}$/) { // Matches "3/22/22" or "03/15/2021"
def parts = value.split("/")
return parts[-1].length() == 2 ? "20${parts[-1]}" : parts[-1] // Handle yy or yyyy
} else if (value =~ /^Q[1-4]'\d{2}$/) { // Matches "Q2'22" (quarter-year format with short year)
return "20" + value[-2..-1]
} else if (value =~ /^Q[1-4]\d{4}$/) { // Matches "Q12021" (quarter-year full format)
return value[-4..-1]
} else if (value =~ /^Q[1-4]\s?\d{2,4}$/) { // Matches "Q217" or "Q2 2026"
def extractedYear = value.replaceAll(/Q[1-4]\s?/, "") // Extract the number after "Q"
return extractedYear.length() == 2 ? "20" + extractedYear : extractedYear
} else if (value =~ /^[1-4]Q\s\d{4}$/) { // Matches "3Q 2016"
return value[-4..-1]
} else if (value =~ /^[A-Za-z]+\s\d{4}$/) { // Matches "June 2017" (Month-Year format)
return value.replaceAll(/[^\d]/, "") // Remove non-digit characters, keep year
} else if (value =~ /^\d{1,2}\/\d{4}$/) { // Matches "06/2017" (MM/YYYY format)
return value.split("/")[1] // Extract year from MM/YYYY format
} else if (value =~ /^[1-4]Q\d{2}$/) { // Matches "2Q18" (Quarter-Year with short year)
return "20" + value[-2..-1]
} else if (value =~ /^\d{2}'\d{2}$/) { // Matches "04'16" (Month-Year short year MM'YY)
return "20" + value[-2..-1] // Extract "16" and convert to "2016"
} else if (value =~ /\b\d{4}\b/) { // Matches all 4-digit years in a string
def matcher = (value =~ /\b\d{4}\b/) // Find all 4-digit years
def allYears = matcher.collect { it.toInteger() } // Collect all matched years as integers
return allYears.min() // Return the earliest year (minimum)
} else {
return null // Return null if no patterns match
}
}

def new_df = DataFrame.empty("Launch Year/Last Time Buy")

// Create a new column that contains the uniform year format
df.rows().select().each { row ->
def launchDate = row.get("Launch Date") // Get value for "Launch Date"
def lastTimeBuy = row.get("Last Time Buy") // Get value for "Last Time Buy"

// Check for "Launch Date" or fall back to "Last Time Buy"
Integer year = extractYear(launchDate ?: lastTimeBuy) as Integer
new_df = new_df.addRow("Launch Year/Last Time Buy": year)
}
//new_df = df.hConcat(new_df).colsExcept("Launch Date", "Last Time Buy").select()
new_df = df.hConcat(new_df)

// Iterate through the rows and print results
for (int i = 0; i < new_df.height(); i++) {
def name = new_df.getColumn("name").get(i) // # Processor name
def launchDate = new_df.getColumn("Launch Date").get(i) // Original "Launch Date"
def lastTimeBuy = new_df.getColumn("Last Time Buy").get(i) // Original "Last Time Buy"
def uniformYear = new_df.getColumn("Launch Year/Last Time Buy").get(i) // New "Uniform Launch Year"


if ((!(launchDate == null || launchDate.toString().trim().isEmpty())
|| !(lastTimeBuy == null || lastTimeBuy.toString().trim().isEmpty()))
&& (uniformYear == null || uniformYear.toString().trim().isEmpty())) {
println "Warning: ${name} -> No valid uniform year found! (Launch Date: $launchDate, Last Time Buy: $lastTimeBuy, Uniform Year: $uniformYear)"
}

//println "Processor: $name, Uniform Year: $uniformYear (Launch Date: $launchDate, Last Time Buy: $lastTimeBuy)"
//if (uniformYear != null) {
// println "Row ${i + 1}: Uniform Year is $uniformYear (Launch Date: $launchDate, Last Time Buy: $lastTimeBuy)"
//} else {
// println "Row ${i + 1}: Warning - No valid value found in 'Launch Date' or 'Last Time Buy'! (Launch Date: $launchDate, Last Time Buy: $lastTimeBuy)"
//}
}

//String table = Printers.tabular.toString(new_df.head(10));
//System.out.println(table);
return new_df
}



Loading