forked from finos/git-proxy
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcheckForAiMlUsage.js
More file actions
168 lines (151 loc) · 5.35 KB
/
checkForAiMlUsage.js
File metadata and controls
168 lines (151 loc) · 5.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
const { Step } = require('../../actions');
const path = require('path');
const config = require('../../../config');
const commitConfig = config.getCommitConfig();
const authorizedlist = config.getAuthorisedList();
const fs = require('fs');
// Patterns for detecting different types of AI/ML assets.
// Each key is a category name; a category is only applied when its name is
// listed in the commit config's `aiMlUsage.blockPatterns` array.
const FILE_PATTERNS = {
  // File-name suffixes of model weight files: .h5, .pb, .pt, .ckpt, .pkl
  modelWeights: /\.(h5|pb|pt|ckpt|pkl)$/,
  // File-name suffix treated as a large dataset: .xlsx
  largeDatasets: /\.(xlsx)$/,
  // `import <lib>` or `require('<lib>')` for AI/ML libraries and tokenizers.
  // The library names on the require side are grouped so the quotes and
  // parentheses apply to every alternative; without the group a bare word
  // such as "torch" anywhere in a file would match.
  aiLibraries:
    /(?:import\s+(tensorflow|torch|keras|sklearn|tokenizer)|require\(['"](?:tensorflow|torch|keras|sklearn|tokenizer)['"]\))/,
  // Whole-word config keys as found in JSON/YAML, including token-related keys
  configKeys: /\b(epochs|learning_rate|batch_size|token)\b/,
  // Whole-word AI/ML function/class names, including tokenize/tokenizer
  aiFunctionNames: /\b(train_model|predict|evaluate|fit|transform|tokenize|tokenizer)\b/,
};
/**
 * Decide from the file name alone whether a file looks like an AI/ML asset
 * (model weights or a dataset).
 *
 * Only categories listed in `commitConfig.aiMlUsage.blockPatterns` are
 * consulted; a category that is not listed never matches.
 *
 * @param {string} fileName - File name to test against the extension patterns.
 * @returns {boolean} `true` when an enabled extension pattern matches.
 */
const isAiMlFileByExtension = (fileName) => {
  const { blockPatterns } = commitConfig.aiMlUsage;
  // Tried in this order; the first enabled category whose pattern matches wins.
  const extensionCategories = ['modelWeights', 'largeDatasets'];
  return extensionCategories.some(
    (category) => blockPatterns.includes(category) && FILE_PATTERNS[category].test(fileName),
  );
};
/**
 * Decide from a file's text whether it looks AI/ML related.
 *
 * Three categories are tried in order (library imports, config keys,
 * function/class names). A category is only applied when its name is listed
 * in `commitConfig.aiMlUsage.blockPatterns`. The first match is reported on
 * the console and ends the search.
 *
 * Only the category label is logged, never the file content: the content can
 * be arbitrarily large and, particularly for the `token` config key, may hold
 * secrets that must not end up in logs.
 *
 * @param {string} fileContent - Full text of the file.
 * @returns {boolean} `true` when an enabled content pattern matches.
 */
const isAiMlFileByContent = (fileContent) => {
  const { blockPatterns } = commitConfig.aiMlUsage;
  const contentChecks = [
    { category: 'aiLibraries', report: () => console.warn('FOUND AI LIBRARIES') },
    { category: 'configKeys', report: () => console.log('configKeys found') },
    { category: 'aiFunctionNames', report: () => console.log('FOUND AI FUNCTION NAMES') },
  ];
  for (const { category, report } of contentChecks) {
    if (blockPatterns.includes(category) && FILE_PATTERNS[category].test(fileContent)) {
      report();
      return true;
    }
  }
  return false;
};
/**
 * Scan a list of changed files for AI/ML indicators.
 *
 * Each path is first judged by its file name; only when the name is not
 * flagged is the file read from disk and its content inspected.
 *
 * @param {string[]} filePaths - '/'-separated paths relative to `repoRoot`.
 * @param {string} repoRoot - Directory the relative paths are joined onto.
 * @returns {Promise<boolean[]>} One entry per input path, in the same order:
 *   `true` when no indicator was found, `false` when the file was flagged.
 */
const detectAiMlUsageFiles = async (filePaths, repoRoot) => {
  const results = [];
  for (const filePath of filePaths) {
    try {
      const fileName = filePath.split('/').pop();
      // A flagged file name is conclusive; skip reading the content.
      if (isAiMlFileByExtension(fileName)) {
        console.log('FOUND EXTENSION for ', fileName);
        results.push(false);
        continue;
      }
      const absolutePath = path.join(repoRoot, filePath);
      const content = await fs.promises.readFile(absolutePath, 'utf8');
      results.push(!isAiMlFileByContent(content));
    } catch (err) {
      console.error(`Error reading file ${filePath}:`, err);
      // Treat errors as no AI/ML usage found. A path taken from a diff may no
      // longer exist on disk (e.g. a deleted or renamed file); reporting it as
      // flagged would block the push with a misleading detection message.
      results.push(true);
    }
  }
  return results;
};
/**
 * Collect the file paths named in the `diff --git a/<path> b/<path>` header
 * lines of a git diff.
 *
 * @param {string} diffContent - Diff text, with lines separated by '\n'.
 * @returns {string[]} The "a/" side path of every header line, in order of
 *   appearance; empty when the text contains no header lines.
 */
const extractFilePathsFromDiff = (diffContent) => {
  const headerPattern = /^diff --git a\/(.+?) b\/(.+?)$/;
  return diffContent
    .split('\n')
    .map((line) => line.match(headerPattern))
    .filter((match) => match !== null)
    .map((match) => match[1]); // group 1 is the path after "a/"
};
/**
 * Push-processor step that blocks a push when the changed files show signs
 * of AI/ML usage.
 *
 * A `checkForAiMlUsage` step is always added to the action. The scan itself
 * only runs when `commitConfig.aiMlUsage.enabled` is truthy, the action has a
 * `diff` step with content, and that diff names at least one file.
 *
 * @param {object} req - Incoming request (not used by this step).
 * @param {object} action - Action being processed; its `steps`, `url` and
 *   `addStep` members are used.
 * @param {Function} [log=console.log] - Sink for informational messages.
 * @returns {Promise<object>} The same `action`, with the new step attached.
 * @throws {Error} When `action.url` has no entry in the authorised list, so
 *   the local repository root to scan cannot be determined.
 */
const exec = async (req, action, log = console.log) => {
  // Look up the diff step before adding our own step to the action.
  const diffStep = action.steps.find((s) => s.stepName === 'diff');
  const step = new Step('checkForAiMlUsage');
  action.addStep(step);

  if (!commitConfig.aiMlUsage.enabled) {
    return action;
  }

  if (!diffStep || !diffStep.content) {
    log('No diff content available.');
    return action;
  }

  const filePaths = extractFilePathsFromDiff(diffStep.content);
  if (filePaths.length === 0) {
    log('No files found in the diff content.');
    return action;
  }

  const repoEntry = authorizedlist.find((item) => item.url === action.url);
  if (!repoEntry) {
    // Fail with an explicit message rather than an opaque TypeError from
    // reading a property of undefined.
    throw new Error(
      `checkForAiMlUsage: repository ${action.url} is not in the authorised list`,
    );
  }

  const scanResults = await detectAiMlUsageFiles(filePaths, repoEntry.LocalRepoRoot);
  // An entry of `false` marks a file that must block the push.
  const isBlocked = scanResults.some((isClean) => isClean === false);
  if (isBlocked) {
    step.blocked = true;
    step.error = true;
    step.errorMessage = 'Your push has been blocked due to AI/ML usage detection';
    log(step.errorMessage);
  }
  return action;
};
exec.displayName = 'checkForAiMlUsage.exec';
module.exports = { exec };