Skip to content

Commit 9b29097

Browse files
committed
Use a ranking query for bot requests (#23863)
* Use a ranking query for bot requests
* try using with rollup again
* update with rollup implementation after fixes
* Fix sorting in database query
1 parent c5ee19b commit 9b29097

13 files changed

Lines changed: 458 additions & 69 deletions

plugins/BotTracking/RecordBuilders/AIAssistantReports.php

Lines changed: 83 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
use Piwik\Plugins\BotTracking\Archiver;
2323
use Piwik\Plugins\BotTracking\Dao\BotRequestsDao;
2424
use Piwik\Plugins\BotTracking\Metrics;
25+
use Piwik\RankingQuery;
2526
use Piwik\Tracker\Action;
2627
use Piwik\Tracker\PageUrl;
2728

@@ -40,13 +41,19 @@ class AIAssistantReports extends RecordBuilder
4041
'Devin' => '',
4142
];
4243

44+
/**
45+
* @var int
46+
*/
47+
private $rankingQueryLimit;
48+
4349
/**
 * Sets up the record builder: the column used for sorting before truncation,
 * the row limits for the main table and its subtables, and the row limit
 * applied to the ranking query.
 */
public function __construct()
4450
{
4551
parent::__construct();
4652

4753
$this->columnToSortByBeforeTruncation = Metrics::COLUMN_REQUESTS;
4854
// Row limits come from the bot-specific archiving settings and are cast to int.
$this->maxRowsInTable = (int)GeneralConfig::getConfigValue('datatable_archiving_maximum_rows_bots');
4955
$this->maxRowsInSubtable = (int)GeneralConfig::getConfigValue('datatable_archiving_maximum_rows_subtable_bots');
56+
// Must run after the two assignments above: getRankingQueryLimit() reads
// $this->maxRowsInTable and $this->maxRowsInSubtable.
$this->rankingQueryLimit = $this->getRankingQueryLimit();
5057
}
5158

5259
public function getRecordMetadata(ArchiveProcessor $archiveProcessor): array
@@ -137,27 +144,7 @@ private function queryAcquiredVisitsByAIAssistant(LogAggregator $logAggregator):
137144
*/
138145
private function populateTableForActionType(array $tables, int $actionType, LogAggregator $logAggregator, array $visits): void
139146
{
140-
$where = $logAggregator->getWhereStatement('bot', 'server_time');
141-
$bindBase = $logAggregator->getGeneralQueryBindParams();
142-
143-
$sql = sprintf(
144-
"SELECT * FROM (SELECT bot.bot_name, log_action.name AS url, COUNT(*) AS requests
145-
FROM %s AS bot
146-
INNER JOIN %s AS log_action ON log_action.idaction = bot.idaction_url
147-
WHERE log_action.name IS NOT NULL
148-
AND log_action.name <> ''
149-
AND log_action.type = %d
150-
AND %s
151-
GROUP BY bot.bot_name, url WITH ROLLUP) AS rollupQuery
152-
ORDER BY bot_name, requests DESC, url",
153-
BotRequestsDao::getPrefixedTableName(),
154-
Common::prefixTable('log_action'),
155-
$actionType,
156-
$where
157-
);
158-
159-
$resultSet = Db::query($sql, $bindBase);
160-
$actionRows = [];
147+
$resultSet = $this->queryBotRequests($logAggregator, $actionType);
161148

162149
while ($row = $resultSet->fetch()) {
163150
/**
@@ -166,45 +153,47 @@ private function populateTableForActionType(array $tables, int $actionType, LogA
166153
$label = $row['bot_name'];
167154
$url = $row['url'];
168155

169-
if (is_null($label)) {
156+
if ($label === null) {
157+
// top-level rollup result
170158
continue;
171159
}
172160

173-
if (!is_null($url)) {
174-
$actionRows[] = $row;
161+
if ($url === null) {
162+
// second-level rollup result
163+
$metrics = [
164+
Metrics::COLUMN_REQUESTS => $row['requests'],
165+
Metrics::COLUMN_DOCUMENT_REQUESTS => $actionType === Action::TYPE_DOWNLOAD ? $row['requests'] : 0,
166+
Metrics::COLUMN_PAGE_REQUESTS => $actionType === Action::TYPE_PAGE_URL ? $row['requests'] : 0,
167+
Metrics::COLUMN_ACQUIRED_VISITS => $visits[$label] ?? 0,
168+
];
169+
170+
$tables[Archiver::AI_ASSISTANTS_PAGES_RECORD]->sumRowWithLabel($label, $metrics, [Metrics::COLUMN_ACQUIRED_VISITS => 'max']);
171+
$tables[Archiver::AI_ASSISTANTS_DOCUMENTS_RECORD]->sumRowWithLabel($label, $metrics, [Metrics::COLUMN_ACQUIRED_VISITS => 'max']);
175172
continue;
176173
}
177174

178-
$metrics = [
179-
Metrics::COLUMN_REQUESTS => $row['requests'],
180-
Metrics::COLUMN_DOCUMENT_REQUESTS => $actionType === Action::TYPE_DOWNLOAD ? $row['requests'] : 0,
181-
Metrics::COLUMN_PAGE_REQUESTS => $actionType === Action::TYPE_PAGE_URL ? $row['requests'] : 0,
182-
Metrics::COLUMN_ACQUIRED_VISITS => $visits[$label] ?? 0,
183-
];
184-
185-
// we add all records to both tables, so we in the end have the total count of pages & documents in the main table
186-
$tables[Archiver::AI_ASSISTANTS_PAGES_RECORD]->sumRowWithLabel($label, $metrics, [Metrics::COLUMN_ACQUIRED_VISITS => 'max']);
187-
$tables[Archiver::AI_ASSISTANTS_DOCUMENTS_RECORD]->sumRowWithLabel($label, $metrics, [Metrics::COLUMN_ACQUIRED_VISITS => 'max']);
188-
}
189175

190-
$table = $tables[Archiver::AI_ASSISTANTS_PAGES_RECORD];
176+
$table = $tables[Archiver::AI_ASSISTANTS_PAGES_RECORD];
191177

192-
if ($actionType === Action::TYPE_DOWNLOAD) {
193-
$table = $tables[Archiver::AI_ASSISTANTS_DOCUMENTS_RECORD];
194-
}
195-
196-
// use while / array_shift combination instead of foreach to save memory
197-
while (is_array($actionRows) && count($actionRows)) {
198-
/**
199-
* @var array{requests: int, bot_name: string, url: string} $row
200-
*/
201-
$row = array_shift($actionRows);
202-
$label = $row['bot_name'];
203-
$url = $row['url'];
178+
if ($actionType === Action::TYPE_DOWNLOAD) {
179+
$table = $tables[Archiver::AI_ASSISTANTS_DOCUMENTS_RECORD];
180+
}
204181

205182
$tableRow = $table->getRowFromLabel($label);
206183

207-
if (empty($tableRow)) {
184+
if (false === $tableRow) {
185+
// non-rollup row but rollup row is missing
186+
// should not happen, but don't break
187+
continue;
188+
}
189+
190+
if (
191+
$url === RankingQuery::LABEL_SUMMARY_ROW
192+
&& !$tableRow->isSubtableLoaded()
193+
) {
194+
// skip creating the subtable if:
195+
// - we are using rollups
196+
// - the only row would be "Others"
208197
continue;
209198
}
210199

@@ -216,4 +205,49 @@ private function populateTableForActionType(array $tables, int $actionType, LogA
216205
]);
217206
}
218207
}
208+
209+
/**
 * Runs the aggregation query that counts bot requests per bot name and URL
 * for a single action type.
 *
 * The inner query groups by (bot_name, url) WITH ROLLUP, so besides the
 * per-URL rows the result also contains rollup rows: the caller treats a row
 * with a null bot_name as the overall total and a row with a null url as the
 * total for one bot. When a ranking query limit is configured (> 0), the SQL
 * is additionally wrapped in a RankingQuery labelled by bot_name and url that
 * sums the "requests" column.
 *
 * @param LogAggregator $logAggregator provides the WHERE statement and the bind parameters
 * @param int $actionType value compared against log_action.type (embedded via %d)
 * @return mixed the result of Db::query(); the caller iterates it with ->fetch()
 */
private function queryBotRequests(LogAggregator $logAggregator, int $actionType)
210+
{
211+
$where = $logAggregator->getWhereStatement('bot', 'server_time');
212+
213+
// Rows are ordered by request count first (descending), then by bot name and URL.
$sql = sprintf(
214+
"SELECT * FROM (SELECT bot.bot_name, log_action.name AS url, COUNT(*) AS requests
215+
FROM %s AS bot
216+
INNER JOIN %s AS log_action ON log_action.idaction = bot.idaction_url
217+
WHERE log_action.name IS NOT NULL
218+
AND log_action.name <> ''
219+
AND log_action.type = %d
220+
AND %s
221+
GROUP BY bot.bot_name, url WITH ROLLUP) AS rollupQuery
222+
ORDER BY requests DESC, bot_name, url",
223+
BotRequestsDao::getPrefixedTableName(),
224+
Common::prefixTable('log_action'),
225+
$actionType,
226+
$where
227+
);
228+
229+
// A limit of 0 means the plain rollup query is executed without a ranking query.
if ($this->rankingQueryLimit > 0) {
230+
$rankingQuery = new RankingQuery($this->rankingQueryLimit);
231+
$rankingQuery->addLabelColumn(['bot_name', 'url']);
232+
$rankingQuery->addColumn('requests', 'sum');
233+
// NOTE(review): the second argument (true) presumably makes the ranking query
// aware of the WITH ROLLUP rows - confirm against RankingQuery::generateRankingQuery().
$sql = $rankingQuery->generateRankingQuery($sql, true);
234+
}
235+
236+
return Db::query($sql, $logAggregator->getGeneralQueryBindParams());
237+
}
238+
239+
/**
 * Determines the row limit passed to the ranking query.
 *
 * Starts from the 'archiving_ranking_query_row_limit' setting, raises it to at
 * least ten times the main table row limit, and finally makes sure it is not
 * smaller than either the main table or the subtable row limit.
 *
 * @return int the ranking query row limit; 0 is treated by queryBotRequests()
 *             as "do not use a ranking query"
 */
private function getRankingQueryLimit(): int
240+
{
241+
$maxRowsInTable = (int)$this->maxRowsInTable;
242+
$maxRowsInSubtable = (int)$this->maxRowsInSubtable;
243+
244+
$configLimit = (int)GeneralConfig::getConfigValue('archiving_ranking_query_row_limit');
245+
$configLimit = max($configLimit, 10 * $maxRowsInTable);
246+
247+
// NOTE(review): because of the max() above, this is only reached when both the
// configured limit and $maxRowsInTable are 0 (assuming non-negative values); a
// configured limit of 0 combined with a positive table limit still yields a
// non-zero ranking limit - confirm this is intended.
if ($configLimit === 0) {
248+
return 0;
249+
}
250+
251+
return max($configLimit, $maxRowsInTable, $maxRowsInSubtable);
252+
}
219253
}

plugins/BotTracking/tests/Fixtures/BotTraffic.php

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,8 +77,10 @@ private function trackBotRequests(): void
7777
['ChatGPT-User/1.0', $pages[1], 500, 25896, false],
7878
['ChatGPT-User/1.0', $downloads[1], 200, 33658, true],
7979
['Perplexity-User/1.0', $pages[2], 200, 36985, false],
80+
['Perplexity-User/1.0', $pages[2], 200, 36985, false],
8081
['MistralAI-User/2.0', $pages[3], 200, 85236, false],
8182
['Claude-User/3.0', $downloads[3], 200, 12456, true],
83+
['Claude-User/3.0', $downloads[4], 200, 35562, true],
8284
],
8385
2 => [
8486
['Perplexity-User/1.0', $downloads[3], 200, 84269, true],
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
<?php
2+
3+
/**
4+
* Matomo - free/libre analytics platform
5+
*
6+
* @link https://matomo.org
7+
* @license https://www.gnu.org/licenses/gpl-3.0.html GPL v3 or later
8+
*/
9+
10+
declare(strict_types=1);
11+
12+
namespace Piwik\Plugins\BotTracking\tests\System;
13+
14+
use Piwik\Cache;
15+
use Piwik\Config;
16+
use Piwik\Plugins\BotTracking\tests\Fixtures\BotTraffic;
17+
use Piwik\Tests\Framework\TestCase\SystemTestCase;
18+
19+
/**
20+
* @group BotTracking
21+
*/
22+
class RankingQueryApiTest extends SystemTestCase
23+
{
24+
/**
25+
* @var BotTraffic
26+
*/
27+
public static $fixture;
28+
29+
/**
 * Archives BotTracking.getAIAssistantRequests (day and week, expanded, with
 * the "pages" secondary dimension) using a ranking query row limit of 3 and
 * no main table row limit, and compares against the expected files with the
 * "ranking_limit_pages" suffix.
 */
public function testRankingQueryUsesOthersRowPages(): void
30+
{
31+
// Bound by reference so the overrides below presumably apply to the live
// config rather than to a copy - relies on Config exposing General by reference.
$generalConfig = &Config::getInstance()->General;
32+
$generalConfig['archiving_ranking_query_row_limit'] = 3;
33+
$generalConfig['datatable_archiving_maximum_rows_bots'] = 0; // no limit here, so we see that the ranking query creates the others row
34+
$generalConfig['datatable_archiving_maximum_rows_subtable_bots'] = 2;
35+
36+
// Clear caches and existing archives before the API calls below; presumably
// this forces re-archiving with the overridden limits.
Cache::flushAll();
37+
self::deleteArchiveTables();
38+
39+
$this->runApiTests(['BotTracking.getAIAssistantRequests'], [
40+
'idSite' => 1,
41+
'date' => '2025-02-03',
42+
'periods' => ['day', 'week'],
43+
'otherRequestParameters' => [
44+
'expanded' => 1,
45+
'secondaryDimension' => 'pages',
46+
],
47+
'testSuffix' => 'ranking_limit_pages',
48+
]);
49+
}
50+
51+
/**
 * Same as the pages test, but with the "documents" secondary dimension, a
 * main table row limit of 4, and the "ranking_limit_documents" suffix.
 */
public function testRankingQueryUsesOthersRowDocuments(): void
52+
{
53+
$generalConfig = &Config::getInstance()->General;
54+
$generalConfig['archiving_ranking_query_row_limit'] = 3;
55+
// NOTE(review): AIAssistantReports::getRankingQueryLimit() raises the ranking
// limit to at least 10 x this value, so the effective ranking limit here would
// be 40 rather than 3 - confirm this test exercises what it intends to.
$generalConfig['datatable_archiving_maximum_rows_bots'] = 4;
56+
$generalConfig['datatable_archiving_maximum_rows_subtable_bots'] = 2;
57+
58+
Cache::flushAll();
59+
self::deleteArchiveTables();
60+
61+
$this->runApiTests(['BotTracking.getAIAssistantRequests'], [
62+
'idSite' => 1,
63+
'date' => '2025-02-03',
64+
'periods' => ['day', 'week'],
65+
'otherRequestParameters' => [
66+
'expanded' => 1,
67+
'secondaryDimension' => 'documents',
68+
],
69+
'testSuffix' => 'ranking_limit_documents',
70+
]);
71+
}
72+
73+
/**
 * Returns an empty prefix for the test's output files.
 */
public static function getOutputPrefix()
74+
{
75+
return '';
76+
}
77+
78+
/**
 * Returns the directory containing this test class.
 */
public static function getPathToTestDirectory()
79+
{
80+
return __DIR__;
81+
}
82+
}
83+
84+
RankingQueryApiTest::$fixture = new BotTraffic();

plugins/BotTracking/tests/System/expected/test__documents__BotTracking.getAIAssistantRequests_day.xml

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,18 +15,22 @@
1515
</row>
1616
<row>
1717
<label>Perplexity-User</label>
18-
<requests>1</requests>
18+
<requests>2</requests>
1919
<document_requests>0</document_requests>
20-
<page_requests>1</page_requests>
20+
<page_requests>2</page_requests>
2121
<visits_acquired>1</visits_acquired>
2222
</row>
2323
<row>
2424
<label>Claude-User</label>
25-
<requests>2</requests>
26-
<document_requests>2</document_requests>
25+
<requests>3</requests>
26+
<document_requests>3</document_requests>
2727
<page_requests>0</page_requests>
2828
<visits_acquired>0</visits_acquired>
2929
<subtable>
30+
<row>
31+
<label>example.com/resources/case-study.pdf</label>
32+
<requests>1</requests>
33+
</row>
3034
<row>
3135
<label>example.com/resources/datasheet.pdf</label>
3236
<requests>1</requests>

plugins/BotTracking/tests/System/expected/test__documents__BotTracking.getAIAssistantRequests_week.xml

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,15 @@
22
<result>
33
<row>
44
<label>Claude-User</label>
5-
<requests>4</requests>
6-
<document_requests>2</document_requests>
5+
<requests>5</requests>
6+
<document_requests>3</document_requests>
77
<page_requests>2</page_requests>
88
<visits_acquired>2</visits_acquired>
99
<subtable>
10+
<row>
11+
<label>example.com/resources/case-study.pdf</label>
12+
<requests>1</requests>
13+
</row>
1014
<row>
1115
<label>example.com/resources/datasheet.pdf</label>
1216
<requests>1</requests>
@@ -19,9 +23,9 @@
1923
</row>
2024
<row>
2125
<label>Perplexity-User</label>
22-
<requests>6</requests>
26+
<requests>7</requests>
2327
<document_requests>5</document_requests>
24-
<page_requests>1</page_requests>
28+
<page_requests>2</page_requests>
2529
<visits_acquired>2</visits_acquired>
2630
<subtable>
2731
<row>

plugins/BotTracking/tests/System/expected/test__flat__BotTracking.getAIAssistantRequests_day.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
</row>
2121
<row>
2222
<label>Perplexity-User - example.com/article-3</label>
23-
<requests>1</requests>
23+
<requests>2</requests>
2424
<BotTracking_AIAssistantName>Perplexity-User</BotTracking_AIAssistantName>
2525
<BotTracking_PageUrl>example.com/article-3</BotTracking_PageUrl>
2626
</row>

plugins/BotTracking/tests/System/expected/test__flat__BotTracking.getAIAssistantRequests_week.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@
8080
</row>
8181
<row>
8282
<label>Perplexity-User - example.com/article-3</label>
83-
<requests>1</requests>
83+
<requests>2</requests>
8484
<BotTracking_AIAssistantName>Perplexity-User</BotTracking_AIAssistantName>
8585
<BotTracking_PageUrl>example.com/article-3</BotTracking_PageUrl>
8686
</row>

plugins/BotTracking/tests/System/expected/test__pages__BotTracking.getAIAssistantRequests_day.xml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,21 +15,21 @@
1515
</row>
1616
<row>
1717
<label>Perplexity-User</label>
18-
<requests>1</requests>
18+
<requests>2</requests>
1919
<document_requests>0</document_requests>
20-
<page_requests>1</page_requests>
20+
<page_requests>2</page_requests>
2121
<visits_acquired>1</visits_acquired>
2222
<subtable>
2323
<row>
2424
<label>example.com/article-3</label>
25-
<requests>1</requests>
25+
<requests>2</requests>
2626
</row>
2727
</subtable>
2828
</row>
2929
<row>
3030
<label>Claude-User</label>
31-
<requests>2</requests>
32-
<document_requests>2</document_requests>
31+
<requests>3</requests>
32+
<document_requests>3</document_requests>
3333
<page_requests>0</page_requests>
3434
<visits_acquired>0</visits_acquired>
3535
</row>

0 commit comments

Comments
 (0)