Skip to content

Commit 0cfa4ab

Browse files
authored
Merge pull request #17 from nadnein/main
updated README.md and tests and finalized code
2 parents ffe8c38 + 4e387db commit 0cfa4ab

10 files changed

Lines changed: 11954 additions & 11825 deletions

File tree

README.md

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,37 @@ This map tells the program, which units to expect in the respective columns. It
8282

8383
There is currently **no support** for mapping scaled units to base units (e.g. 1 kW -> 1000 W).
8484

85+
### TDP value conversion
86+
The TDP values are given in a different format depending on the manufacturer / CPU model.
87+
They have been brought into a standardized format. If a range of values was specified, the lowest value was always used.
88+
The following table shows examples of the different formats on the left and the standardized version on the right.
89+
90+
<table>
91+
<tr><th>Before:</th><th></th><th>After:</th></tr>
92+
<tr><td>
93+
94+
| tdp (W) |
95+
|----------|
96+
| 15-30 |
97+
| 1.5/20 |
98+
| 3.1--6 |
99+
100+
</td><td></td><td>
101+
102+
| tdp (W) |
103+
|---------|
104+
| 15 |
105+
| 1.5 |
106+
| 3.1 |
107+
108+
</td></tr> </table>
109+
110+
### Default TDP value computation
111+
Default TDP values are computed per intended use of the processors listed in the table.
112+
For this purpose, only processors with a `launch date/last time buy` date not older than the `current year -
113+
10 years` are considered.
114+
The year information can be changed in the `computeDefaultTdps` function. The default TDP values are added to the output CSV file as bottom rows.
115+
85116
## Contributing
86117
### Linting
87118
The linting rules are specified in `.groovylintrc.json` please apply them, when contributing new code.

app/src/main/groovy/org/cpuInfoFetcher/Main.groovy

Lines changed: 29 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,10 @@
11
package org.cpuinfofetcher
22

3-
import org.cpuinfofetcher.utils.Helpers
4-
import org.cpuinfofetcher.utils.UnitsAdapter
5-
import org.dflib.Exp
6-
import org.dflib.Series
7-
import org.dflib.Printers
8-
9-
import static org.dflib.Exp.*
10-
import java.time.LocalDateTime
11-
12-
133
import java.nio.file.Files
144
import java.util.logging.Logger
15-
165
import java.nio.file.Paths
17-
186
import org.dflib.DataFrame
7+
import org.dflib.Printers
198
import org.dflib.JoinType
209
import org.dflib.csv.Csv
2110

@@ -49,6 +38,9 @@ class Main {
4938
'cores': ['Total Cores', '# of CPU Cores', 'cores'],
5039
'threads': ['cores', 'Total Cores', '# of CPU Cores', 'Total Threads', '# of Threads', 'threads']
5140
]
41+
42+
static Map<String, List<String>> specification_aliases_retain_null_entries = ['name': ['name'], "Launch Year/Last Time Buy": ["Launch Year/Last Time Buy"]]
43+
5244
// Mapping units to columns
5345
static Map<String, List<String>> units_mapping = ['tdp': ['W', 'Watt']]
5446

@@ -85,10 +77,6 @@ class Main {
8577
return specifications
8678
}
8779

88-
static DataFrame removeDuplicates(DataFrame specifications) {
89-
return specifications.rows().selectUnique('name')
90-
}
91-
9280

9381

9482
static void main(String[] args) {
@@ -104,26 +92,47 @@ class Main {
10492

10593
// Merging Info into big file
10694
DataFrame specifications = mergeSpecifications(specificationsList)
107-
specifications = removeDuplicates(specifications)
95+
96+
// Remove duplicate rows
97+
specifications = ProcessSpecificationsTable.removeDuplicates(specifications)
98+
99+
// Extract a uniform year for all rows
100+
specifications = ProcessSpecificationsTable.extractUniformYearColumn(specifications)
101+
108102
Csv.save(specifications, Paths.get('..', 'specifications_out', 'specifications.csv'))
109103
this.specifications = specifications
110104
LOGGER.info('Merged all specifications.')
111105

112106
// Selecting relevant information
113107
CPUSpecificationsSummarizer summarizer = new CPUSpecificationsSummarizer()
114-
DataFrame selected_specifications = summarizer.extract_selection(
115-
specifications,
108+
109+
selected_specifications = summarizer.extract_selection(
110+
specifications,
116111
this.specification_aliases,
117112
true
118113
)
114+
115+
// Add "Launch Year/Last Time Buy" column
116+
DataFrame columns_to_add = summarizer.extract_selection(
117+
specifications,
118+
this.specification_aliases_retain_null_entries,
119+
false
120+
)
121+
122+
// Perform Left Join (keeping all rows of selected_specifications)
123+
def selected_specifications = selected_specifications.join(columns_to_add)
124+
.on("name")
125+
.colsExcept(c -> c.endsWith("_"))
126+
.select()
127+
119128
LOGGER.info('Extracted information.')
120129

121130
UnitsAdapter ua = new UnitsAdapter()
122131
selected_specifications = ua.unitToColumnName(selected_specifications, this.units_mapping)
123132
LOGGER.info('Extracted units from data.')
124133

125134
// adjusts format of tdp values to make them uniform
126-
selected_specifications = ua.extractFirstNumber(selected_specifications)
135+
selected_specifications = ProcessSpecificationsTable.extractFirstNumber(selected_specifications)
127136

128137
// add default TDPs
129138
selected_specifications = ProcessSpecificationsTable.computeDefaultTdps(selected_specifications)

app/src/main/groovy/org/cpuInfoFetcher/ProcessSpecificationsTable.groovy

Lines changed: 182 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,94 @@
11
package org.cpuinfofetcher
22
import org.cpuinfofetcher.utils.Helpers
33
import org.dflib.DataFrame
4+
import org.dflib.Printers
45
import java.time.LocalDateTime
56
import static org.dflib.Exp.$col
7+
import java.time.Year
8+
9+
/**
 * Adapts columns with units to be more uniform.
 */
class UnitsAdapter {

    /**
     * Annotates column names with their primary unit (e.g. "tdp" -> "tdp (W)")
     * and strips all known unit tokens from the values of those columns.
     *
     * @param df the input DataFrame
     * @param unit_mapping column name -> list of unit aliases; the first alias
     *        is the canonical unit appended to the column name
     * @return a new DataFrame with unit-annotated column names and unit-free values
     */
    static DataFrame unitToColumnName(DataFrame df, Map<String, List<String>> unit_mapping) {
        List<String> old_col_names = df.getColumnsIndex().toArray()

        // Build new column names; skip columns that are unmapped or already annotated.
        List<String> new_col_names = []
        for (String col_name : old_col_names) {
            List<String> units = unit_mapping.get(col_name)
            if (units == null || col_name.endsWith("(${units.get(0)})")) {
                new_col_names.add(col_name)
            } else {
                new_col_names.add("${col_name} (${units.get(0)})")
            }
        }

        // Rebuild the frame row by row, stripping unit tokens from mapped columns.
        def new_df = DataFrame.byArrayRow(*new_col_names).appender()
        for (int i = 0; i < df.height(); i++) {
            List row = []
            for (int j = 0; j < df.width(); j++) {
                List<String> units = unit_mapping.get(old_col_names.get(j))
                String value = df.get(j, i)
                // Guard against null cells (previously an NPE on replaceAll).
                if (units != null && value != null) {
                    // BUGFIX: strip EVERY alias, not only units.get(0); longest
                    // alias first so e.g. "Watt" is removed before "W". Literal
                    // replace() avoids treating aliases as regex patterns.
                    List<String> by_length = units.toSorted { a, b -> b.length() <=> a.length() }
                    for (String unit : by_length) {
                        value = value.replace(unit, '')
                    }
                    value = value.replaceAll(' ', '')
                }
                row.add(value)
            }
            new_df.append(*row)
        }

        return new_df.toDataFrame()
    }

}
50+
51+
/**
 * Extracts the first numeric value from the 'tdp (W)' column and updates the DataFrame.
 *
 * Examples of extraction:
 * - "15-30"  --> 15
 * - "1.5/20" --> 1.5
 * - "3.1--6" --> 3.1
 *
 * @param df the input DataFrame; must contain a 'tdp (W)' column
 * @return a DataFrame whose 'tdp (W)' column holds only the first numeric value
 *         (null when the cell is null or starts with no number)
 */
static DataFrame extractFirstNumber(DataFrame df) {
    DataFrame old_df = df.cols().selectAs(Map.of("tdp (W)", "tdp old"))
    def new_df = DataFrame.empty("tdp (W)")

    for (int i = 0; i < old_df.height(); i++) {
        String tdp_value = old_df.rows(i).select().get("tdp old", 0)
        Double new_tdp_value = null
        // BUGFIX: guard against null cells — `null =~ /…/` throws an NPE.
        if (tdp_value != null) {
            // Match the leading number (integer or decimal), e.g. "15-30" -> "15".
            def matcher = tdp_value =~ /^[0-9]*\.?[0-9]+/
            new_tdp_value = matcher.find() ? Double.parseDouble(matcher.group(0)) : null
        }
        new_df = new_df.addRow("tdp (W)": new_tdp_value)
    }

    // Swap the raw column for the parsed one.
    return old_df.hConcat(new_df).colsExcept('tdp old').select()
}
78+
79+
80+
/**
 * Removes duplicate rows from the DataFrame based on the 'name' column.
 *
 * @param specifications the DataFrame to process
 * @return a DataFrame with unique rows by 'name'
 */
static DataFrame removeDuplicates(DataFrame specifications) {
    DataFrame deduplicated = specifications.rows().selectUnique('name')
    return deduplicated
}
90+
91+
692

793
/**
894
* Adds default TDP values to the specifications DataFrame.
@@ -14,7 +100,14 @@ import static org.dflib.Exp.$col
14100
* @return the updated DataFrame with default TDP values
15101
*/
16102
static DataFrame computeDefaultTdps(DataFrame specifications) {
17-
DataFrame aggregatedDf = specifications.group('intended_usage').agg(
103+
int currentYear = Year.now().getValue()
104+
105+
DataFrame specifications_filtered = specifications.rows({ row ->
106+
def year = row.get("Launch Year/Last Time Buy") as Integer
107+
return year != null && year >= currentYear - 10
108+
}).select()
109+
110+
DataFrame aggregatedDf = specifications_filtered.group('intended_usage').agg(
18111
$col('intended_usage').first().as('intended_usage'),
19112
$col('cores').castAsInt().avg().as("avg_cores"),
20113
$col('threads').castAsInt().avg().as("avg_threads"),
@@ -55,4 +148,91 @@ static DataFrame computeDefaultTdps(DataFrame specifications) {
55148
}
56149

57150
return specifications
58-
}
151+
}
152+
153+
154+
155+
/**
 * Derives a uniform 'Launch Year/Last Time Buy' year column from the
 * heterogeneous date formats found in 'Launch Date' and 'Last Time Buy'.
 *
 * 'Launch Date' takes precedence; 'Last Time Buy' is the fallback. Rows where
 * neither column yields a recognisable year get null, and a warning is
 * printed for rows that had a source date but could not be normalised.
 *
 * @param df the input DataFrame; must contain 'name', 'Launch Date' and
 *        'Last Time Buy' columns
 * @return the input DataFrame with the 'Launch Year/Last Time Buy' column appended
 */
static DataFrame extractUniformYearColumn(DataFrame df) {
    // Parse one date string into a 4-digit year; null when no pattern matches.
    def extractYear = { value ->
        if (value == null || value.toString().trim().isEmpty()) {
            return null // nothing to parse
        }
        value = value.toString().trim()

        if (value =~ /^\d{4}$/) { // "2023"
            return value
        } else if (value =~ /^\d{1,2}\/\d{1,2}\/\d{2,4}$/) { // "3/22/22", "03/15/2021"
            def parts = value.split("/")
            return parts[-1].length() == 2 ? "20${parts[-1]}" : parts[-1]
        } else if (value =~ /^Q[1-4]'\d{2}$/) { // "Q2'22"
            return "20" + value[-2..-1]
        } else if (value =~ /^Q[1-4]\d{4}$/) { // "Q12021"
            return value[-4..-1]
        } else if (value =~ /^Q[1-4]\s?\d{2,4}$/) { // "Q217", "Q2 2026"
            def extractedYear = value.replaceAll(/Q[1-4]\s?/, "")
            return extractedYear.length() == 2 ? "20" + extractedYear : extractedYear
        } else if (value =~ /^[1-4]Q\s\d{4}$/) { // "3Q 2016"
            return value[-4..-1]
        } else if (value =~ /^[A-Za-z]+\s\d{4}$/) { // "June 2017"
            return value.replaceAll(/[^\d]/, "")
        } else if (value =~ /^\d{1,2}\/\d{4}$/) { // "06/2017"
            return value.split("/")[1]
        } else if (value =~ /^[1-4]Q\d{2}$/) { // "2Q18"
            return "20" + value[-2..-1]
        } else if (value =~ /^\d{2}'\d{2}$/) { // "04'16"
            return "20" + value[-2..-1]
        } else if (value =~ /\b\d{4}\b/) { // any embedded 4-digit years -> earliest
            def matcher = (value =~ /\b\d{4}\b/)
            def allYears = matcher.collect { it.toInteger() }
            return allYears.min()
        } else {
            return null // unrecognised format
        }
    }

    def new_df = DataFrame.empty("Launch Year/Last Time Buy")

    // Build the year column, preferring "Launch Date" over "Last Time Buy".
    df.rows().select().each { row ->
        def launchDate = row.get("Launch Date")
        def lastTimeBuy = row.get("Last Time Buy")
        Integer year = extractYear(launchDate ?: lastTimeBuy) as Integer
        new_df = new_df.addRow("Launch Year/Last Time Buy": year)
    }
    new_df = df.hConcat(new_df)

    // Surface rows that carried a date but produced no uniform year.
    for (int i = 0; i < new_df.height(); i++) {
        def name = new_df.getColumn("name").get(i)
        def launchDate = new_df.getColumn("Launch Date").get(i)
        def lastTimeBuy = new_df.getColumn("Last Time Buy").get(i)
        def uniformYear = new_df.getColumn("Launch Year/Last Time Buy").get(i)

        boolean hasSourceDate = !(launchDate == null || launchDate.toString().trim().isEmpty()) ||
            !(lastTimeBuy == null || lastTimeBuy.toString().trim().isEmpty())
        if (hasSourceDate && (uniformYear == null || uniformYear.toString().trim().isEmpty())) {
            println "Warning: ${name} -> No valid uniform year found! (Launch Date: $launchDate, Last Time Buy: $lastTimeBuy, Uniform Year: $uniformYear)"
        }
    }

    return new_df
}
236+
237+
238+

0 commit comments

Comments
 (0)