diff --git a/.psalm/baseline.xml b/.psalm/baseline.xml index 3725f50d..4305d6d9 100644 --- a/.psalm/baseline.xml +++ b/.psalm/baseline.xml @@ -15,13 +15,15 @@ $argv - + $option[0] $option[1] $option[1] $option[1] $option[1] $option[1] + $option[1] + $option[1] $directories diff --git a/src/CLI/Application.php b/src/CLI/Application.php index 15b977cc..5a4a6ebd 100644 --- a/src/CLI/Application.php +++ b/src/CLI/Application.php @@ -12,9 +12,13 @@ use const PHP_EOL; use function count; use function printf; +use Exception; use SebastianBergmann\FileIterator\Facade; use SebastianBergmann\PHPCPD\Detector\Detector; +use SebastianBergmann\PHPCPD\Detector\Strategy\AbstractStrategy; use SebastianBergmann\PHPCPD\Detector\Strategy\DefaultStrategy; +use SebastianBergmann\PHPCPD\Detector\Strategy\StrategyConfiguration; +use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTreeStrategy; use SebastianBergmann\PHPCPD\Log\PMD; use SebastianBergmann\PHPCPD\Log\Text; use SebastianBergmann\Timer\ResourceUsageFormatter; @@ -62,17 +66,14 @@ public function run(array $argv): int return 1; } - $strategy = new DefaultStrategy; + $config = new StrategyConfiguration($arguments); + + $strategy = $this->pickStrategy($arguments->algorithm(), $config); $timer = new Timer; $timer->start(); - $clones = (new Detector($strategy))->copyPasteDetection( - $files, - $arguments->linesThreshold(), - $arguments->tokensThreshold(), - $arguments->fuzzy() - ); + $clones = (new Detector($strategy))->copyPasteDetection($files); (new Text)->printResult($clones, $arguments->verbose()); @@ -93,6 +94,21 @@ private function printVersion(): void ); } + private function pickStrategy(?string $algorithm, StrategyConfiguration $config): AbstractStrategy + { + switch ($algorithm) { + case null: + case 'rabin-karp': + return new DefaultStrategy($config); + + case 'suffixtree': + return new SuffixTreeStrategy($config); + + default: + throw new Exception('Unsupported algorithm: ' . 
$algorithm); + } + } + private function help(): void { print <<<'EOT' @@ -108,9 +124,12 @@ private function help(): void Options for analysing files: - --fuzzy Fuzz variable names - --min-lines Minimum number of identical lines (default: 5) - --min-tokens Minimum number of identical tokens (default: 70) + --fuzzy Fuzz variable names + --min-lines Minimum number of identical lines (default: 5) + --min-tokens Minimum number of identical tokens (default: 70) + --algorithm Select which algorithm to use ('rabin-karp' (default) or 'suffixtree') + --edit-distance Distance in number of edits between two clones (only for suffixtree; default: 5) + --head-equality Minimum equality at start of clone (only for suffixtree; default 10) Options for report generation: diff --git a/src/CLI/Arguments.php b/src/CLI/Arguments.php index cbe4f296..112aca2a 100644 --- a/src/CLI/Arguments.php +++ b/src/CLI/Arguments.php @@ -61,7 +61,22 @@ final class Arguments */ private $version; - public function __construct(array $directories, array $suffixes, array $exclude, ?string $pmdCpdXmlLogfile, int $linesThreshold, int $tokensThreshold, bool $fuzzy, bool $verbose, bool $help, bool $version) + /** + * @var ?string + */ + private $algorithm; + + /** + * @var int + */ + private $editDistance; + + /** + * @var int + */ + private $headEquality; + + public function __construct(array $directories, array $suffixes, array $exclude, ?string $pmdCpdXmlLogfile, int $linesThreshold, int $tokensThreshold, bool $fuzzy, bool $verbose, bool $help, bool $version, ?string $algorithm, int $editDistance, int $headEquality) { $this->directories = $directories; $this->suffixes = $suffixes; @@ -73,6 +88,9 @@ public function __construct(array $directories, array $suffixes, array $exclude, $this->verbose = $verbose; $this->help = $help; $this->version = $version; + $this->algorithm = $algorithm; + $this->editDistance = $editDistance; + $this->headEquality = $headEquality; } /** @@ -133,4 +151,19 @@ public function 
version(): bool { return $this->version; } + + public function algorithm(): ?string + { + return $this->algorithm; + } + + public function editDistance(): int + { + return $this->editDistance; + } + + public function headEquality(): int + { + return $this->headEquality; + } } diff --git a/src/CLI/ArgumentsBuilder.php b/src/CLI/ArgumentsBuilder.php index c7dfa0b5..a92c5a52 100644 --- a/src/CLI/ArgumentsBuilder.php +++ b/src/CLI/ArgumentsBuilder.php @@ -30,9 +30,12 @@ public function build(array $argv): Arguments 'fuzzy', 'min-lines=', 'min-tokens=', + 'head-equality=', + 'edit-distance=', 'verbose', 'help', 'version', + 'algorithm=', ] ); } catch (CliParserException $e) { @@ -49,10 +52,13 @@ public function build(array $argv): Arguments $pmdCpdXmlLogfile = null; $linesThreshold = 5; $tokensThreshold = 70; + $editDistance = 5; + $headEquality = 10; $fuzzy = false; $verbose = false; $help = false; $version = false; + $algorithm = 'rabin-karp'; foreach ($options[0] as $option) { switch ($option[0]) { @@ -86,6 +92,16 @@ public function build(array $argv): Arguments break; + case '--head-equality': + $headEquality = (int) $option[1]; + + break; + + case '--edit-distance': + $editDistance = (int) $option[1]; + + break; + case '--verbose': $verbose = true; @@ -101,6 +117,11 @@ public function build(array $argv): Arguments case '--version': $version = true; + break; + + case '--algorithm': + $algorithm = (string) $option[1]; + break; } } @@ -122,6 +143,9 @@ public function build(array $argv): Arguments $verbose, $help, $version, + $algorithm, + $editDistance, + $headEquality ); } } diff --git a/src/Detector/Detector.php b/src/Detector/Detector.php index 9ccffeee..a9acbe40 100644 --- a/src/Detector/Detector.php +++ b/src/Detector/Detector.php @@ -24,7 +24,7 @@ public function __construct(AbstractStrategy $strategy) $this->strategy = $strategy; } - public function copyPasteDetection(iterable $files, int $minLines = 5, int $minTokens = 70, bool $fuzzy = false): CodeCloneMap + 
public function copyPasteDetection(iterable $files): CodeCloneMap { $result = new CodeCloneMap; @@ -35,13 +35,12 @@ public function copyPasteDetection(iterable $files, int $minLines = 5, int $minT $this->strategy->processFile( $file, - $minLines, - $minTokens, - $result, - $fuzzy + $result ); } + $this->strategy->postProcess(); + return $result; } } diff --git a/src/Detector/Strategy/AbstractStrategy.php b/src/Detector/Strategy/AbstractStrategy.php index 2efd672b..ff6bb4d9 100644 --- a/src/Detector/Strategy/AbstractStrategy.php +++ b/src/Detector/Strategy/AbstractStrategy.php @@ -37,10 +37,21 @@ abstract class AbstractStrategy T_NS_SEPARATOR => true, ]; - /** - * @psalm-var array - */ - protected $hashes = []; + protected $config; + + public function __construct(StrategyConfiguration $config) + { + $this->config = $config; + } + + public function setConfig(StrategyConfiguration $config): void + { + $this->config = $config; + } + + abstract public function processFile(string $file, CodeCloneMap $result): void; - abstract public function processFile(string $file, int $minLines, int $minTokens, CodeCloneMap $result, bool $fuzzy = false): void; + public function postProcess(): void + { + } } diff --git a/src/Detector/Strategy/DefaultStrategy.php b/src/Detector/Strategy/DefaultStrategy.php index 014a33dd..7a90dd05 100644 --- a/src/Detector/Strategy/DefaultStrategy.php +++ b/src/Detector/Strategy/DefaultStrategy.php @@ -25,9 +25,26 @@ use SebastianBergmann\PHPCPD\CodeCloneFile; use SebastianBergmann\PHPCPD\CodeCloneMap; +/** + * This is a Rabin-Karp with an additional normalization steps before + * the hashing happens. + * + * 1. Tokenization + * 2. Deletion of logic neutral tokens like T_CLOSE_TAG;T_COMMENT; + * T_DOC_COMMENT; T_INLINE_HTML; T_NS_SEPARATOR; T_OPEN_TAG; + * T_OPEN_TAG_WITH_ECHO; T_USE; T_WHITESPACE; + * 3. If needed deletion of variable names + * 4. Normalization of token + value using crc32 + * 5. 
Now the classic Rabin-Karp hashing takes place + */ final class DefaultStrategy extends AbstractStrategy { - public function processFile(string $file, int $minLines, int $minTokens, CodeCloneMap $result, bool $fuzzy = false): void + /** + * @psalm-var array + */ + protected $hashes = []; + + public function processFile(string $file, CodeCloneMap $result): void { $buffer = file_get_contents($file); $currentTokenPositions = []; @@ -55,7 +72,7 @@ public function processFile(string $file, int $minLines, int $minTokens, CodeClo $currentTokenRealPositions[$tokenNr++] = $token[2]; - if ($fuzzy && $token[0] === T_VARIABLE) { + if ($this->config->getFuzzy() && $token[0] === T_VARIABLE) { $token[1] = 'variable'; } @@ -73,7 +90,7 @@ public function processFile(string $file, int $minLines, int $minTokens, CodeClo $found = false; $tokenNr = 0; - while ($tokenNr <= $count - $minTokens) { + while ($tokenNr <= $count - $this->config->getMinTokens()) { $line = $currentTokenPositions[$tokenNr]; $realLine = $currentTokenRealPositions[$tokenNr]; @@ -82,7 +99,7 @@ public function processFile(string $file, int $minLines, int $minTokens, CodeClo substr( $currentSignature, $tokenNr * 5, - $minTokens * 5 + $this->config->getMinTokens() * 5 ), true ), @@ -103,13 +120,13 @@ public function processFile(string $file, int $minLines, int $minTokens, CodeClo if ($found) { $fileA = $this->hashes[$firstHash][0]; $firstLineA = $this->hashes[$firstHash][1]; - $lastToken = ($tokenNr - 1) + $minTokens - 1; + $lastToken = ($tokenNr - 1) + $this->config->getMinTokens() - 1; $lastLine = $currentTokenPositions[$lastToken]; $lastRealLine = $currentTokenRealPositions[$lastToken]; $numLines = $lastLine + 1 - $firstLine; $realNumLines = $lastRealLine + 1 - $firstRealLine; - if ($numLines >= $minLines && + if ($numLines >= $this->config->getMinLines() && ($fileA !== $file || $firstLineA !== $firstRealLine)) { $result->add( @@ -135,13 +152,13 @@ public function processFile(string $file, int $minLines, int 
$minTokens, CodeClo if ($found) { $fileA = $this->hashes[$firstHash][0]; $firstLineA = $this->hashes[$firstHash][1]; - $lastToken = ($tokenNr - 1) + $minTokens - 1; + $lastToken = ($tokenNr - 1) + $this->config->getMinTokens() - 1; $lastLine = $currentTokenPositions[$lastToken]; $lastRealLine = $currentTokenRealPositions[$lastToken]; $numLines = $lastLine + 1 - $firstLine; $realNumLines = $lastRealLine + 1 - $firstRealLine; - if ($numLines >= $minLines && + if ($numLines >= $this->config->getMinLines() && ($fileA !== $file || $firstLineA !== $firstRealLine)) { $result->add( new CodeClone( diff --git a/src/Detector/Strategy/StrategyConfiguration.php b/src/Detector/Strategy/StrategyConfiguration.php new file mode 100644 index 00000000..370b9b89 --- /dev/null +++ b/src/Detector/Strategy/StrategyConfiguration.php @@ -0,0 +1,91 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ +namespace SebastianBergmann\PHPCPD\Detector\Strategy; + +use SebastianBergmann\PHPCPD\Arguments; + +/** + * Small DTO to carry configuration for a strategy. + * Different algorithms have different configs available. + */ +final class StrategyConfiguration +{ + /** + * Minimum lines to consider. + * + * @var int + */ + private $minLines = 5; + + /** + * Minimum tokens to consider in a clone. + * + * @var int + */ + private $minTokens = 70; + + /** + * Edit distance to consider when comparing two clones + * Only available for the suffix-tree algorithm. + * + * @var int + */ + private $editDistance = 5; + + /** + * Tokens that must be equal to consider a clone + * Only available for the suffix-tree algorithm. + * + * @var int + */ + private $headEquality = 10; + + /** + * Fuzz variable names + * suffixtree always makes variables and functions fuzzy. 
+ * + * @var bool + */ + private $fuzzy = false; + + public function __construct(Arguments $arguments) + { + $this->minLines = $arguments->linesThreshold(); + $this->minTokens = $arguments->tokensThreshold(); + $this->fuzzy = $arguments->fuzzy(); + $this->editDistance = $arguments->editDistance(); + $this->headEquality = $arguments->headEquality(); + } + + public function getMinLines(): int + { + return $this->minLines; + } + + public function getMinTokens(): int + { + return $this->minTokens; + } + + public function getFuzzy(): bool + { + return $this->fuzzy; + } + + public function getHeadEquality(): int + { + return $this->headEquality; + } + + public function getEditDistance(): int + { + return $this->editDistance; + } +} diff --git a/src/Detector/Strategy/SuffixTree/AbstractToken.php b/src/Detector/Strategy/SuffixTree/AbstractToken.php new file mode 100644 index 00000000..96425032 --- /dev/null +++ b/src/Detector/Strategy/SuffixTree/AbstractToken.php @@ -0,0 +1,34 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. 
+ */ +namespace SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree; + +abstract class AbstractToken +{ + /** @var int */ + public $tokenCode; + + /** @var int */ + public $line; + + /** @var string */ + public $file; + + /** @var string */ + public $tokenName; + + /** @var string */ + public $content; + + abstract public function __toString(): string; + + abstract public function hashCode(): int; + + abstract public function equals(self $obj): bool; +} diff --git a/src/Detector/Strategy/SuffixTree/ApproximateCloneDetectingSuffixTree.php b/src/Detector/Strategy/SuffixTree/ApproximateCloneDetectingSuffixTree.php new file mode 100644 index 00000000..896fb4db --- /dev/null +++ b/src/Detector/Strategy/SuffixTree/ApproximateCloneDetectingSuffixTree.php @@ -0,0 +1,551 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ +namespace SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree; + +/** + * An extension of the suffix tree adding an algorithm for finding approximate + * clones, i.e. substrings which are similar. + * + * @author $Author: hummelb $ + * + * @version $Revision: 43151 $ + * @ConQAT.Rating GREEN Hash: BB94CD690760BC239F04D32D5BCAC33E + */ +class ApproximateCloneDetectingSuffixTree extends SuffixTree +{ + /** + * The minimal length of clones to return. + * + * @var int + */ + protected $minLength = 70; + + /** + * The number of leaves reachable from the given node (1 for leaves). + * + * @var int[] + * */ + private $leafCount = []; + + /** + * This is the distance between two entries in the {@link #cloneInfos} map. + * + * @var int + */ + private $INDEX_SPREAD = 10; + + /** + * This map stores for each position the relevant clone infos. + * + * @var array + */ + private $cloneInfos = []; + + /** + * The maximal length of a clone. This influences the size of the + * (quadratic) {@link #edBuffer}. 
+ * + * @var int + */ + private $MAX_LENGTH = 1024; + + /** + * Buffer used for calculating edit distance. + * + * @var array + */ + private $edBuffer = []; + + /** + * Number of units that must be equal at the start of a clone. + * + * @var int + */ + private $headEquality = 10; + + /** + * Create a new suffix tree from a given word. The word given as parameter + * is used internally and should not be modified anymore, so copy it before + * if required. + *

+ * This only word correctly if the given word is closed using a sentinel + * character. + * + * @param AbstractToken[] $word List of tokens to analyze + */ + public function __construct(array $word) + { + parent::__construct($word); + + $arr = array_fill(0, $this->MAX_LENGTH, 0); + $this->edBuffer = array_fill(0, $this->MAX_LENGTH, $arr); + $this->ensureChildLists(); + $this->leafCount = array_fill(0, $this->numNodes, 0); + $this->initLeafCount(0); + } + + /** + * @todo Add options: + * --min-tokens + * --min-lines + * --edit-distance + * @todo Possibly add consumer from original code. + */ + + /** + * Finds all clones in the string (List) used in the constructor. + * + * @param int $minLength the minimal length of a clone in tokens (not lines) + * @param int $maxErrors the maximal number of errors/gaps allowed + * @param int $headEquality the number of elements which have to be the same at the beginning of a clone + * + * @return CloneInfo[] + */ + public function findClones(int $minLength, int $maxErrors, int $headEquality): array + { + $this->minLength = $minLength; + $this->headEquality = $headEquality; + $this->cloneInfos = []; + + for ($i = 0; $i < count($this->word); $i++) { + // Do quick start, as first character has to match anyway. + $node = $this->nextNode->get(0, $this->word[$i]); + + if ($node < 0 || $this->leafCount[$node] <= 1) { + continue; + } + + // we know that we have an exact match of at least 'length' + // characters, as the word itself is part of the suffix tree. 
+ $length = $this->nodeWordEnd[$node] - $this->nodeWordBegin[$node]; + $numReported = 0; + + for ($e = $this->nodeChildFirst[$node]; $e >= 0; $e = $this->nodeChildNext[$e]) { + if ($this->matchWord( + $i, + $i + $length, + $this->nodeChildNode[$e], + $length, + $maxErrors + )) { + $numReported++; + } + } + + if ($length >= $this->minLength && $numReported != 1) { + $this->reportClone($i, $i + $length, $node, $length, $length); + } + } + + $map = []; + + for ($index = 0; $index <= count($this->word); $index++) { + /** @var CloneInfo[] */ + $existingClones = $this->cloneInfos[$index] ?? null; + + if (!empty($existingClones)) { + foreach ($existingClones as $ci) { + // length = number of tokens + // TODO: min token length + if ($ci->length > $minLength) { + $previousCi = $map[$ci->token->line] ?? null; + + if ($previousCi === null) { + $map[$ci->token->line] = $ci; + } elseif ($ci->length > $previousCi->length) { + $map[$ci->token->line] = $ci; + } + } + } + } + } + + /** @var CloneInfo[] */ + $values = array_values($map); + usort($values, static function (CloneInfo $a, CloneInfo $b): int { + return $b->length - $a->length; + }); + + return $values; + } + + /** + * This should return true, if the provided character is not allowed to + * match with anything else (e.g. is a sentinel). + */ + protected function mayNotMatch(AbstractToken $token): bool + { + return $token instanceof Sentinel; + } + + /** + * This method is called whenever the {@link #MAX_LENGTH} is to small and + * hence the {@link #edBuffer} was not large enough. This may cause that a + * really large clone is reported in multiple chunks of size + * {@link #MAX_LENGTH} and potentially minor parts of such a clone might be + * lost. + */ + protected function reportBufferShortage(int $leafStart, int $leafLength): void + { + print 'Encountered buffer shortage: ' . $leafStart . ' ' . $leafLength . 
"\n"; + } + + /** + * Initializes the {@link #leafCount} array which given for each node the + * number of leaves reachable from it (where leaves obtain a value of 1). + */ + private function initLeafCount(int $node): void + { + $this->leafCount[$node] = 0; + + for ($e = $this->nodeChildFirst[$node]; $e >= 0; $e = $this->nodeChildNext[$e]) { + $this->initLeafCount($this->nodeChildNode[$e]); + $this->leafCount[$node] += $this->leafCount[$this->nodeChildNode[$e]]; + } + + if ($this->leafCount[$node] == 0) { + $this->leafCount[$node] = 1; + } + } + + /** + * Performs the approximative matching between the input word and the tree. + * + * @param int $wordStart the start position of the currently matched word (position in + * the input word) + * @param int $wordPosition the current position along the input word + * @param int $node the node we are currently at (i.e. the edge leading to this + * node is relevant to us). + * @param int $nodeWordLength the length of the word found along the nodes (this may be + * different from the length along the input word due to gaps) + * @param int $maxErrors the number of errors still allowed + * + * @return bool whether some clone was reported + */ + private function matchWord(int $wordStart, int $wordPosition, int $node, int $nodeWordLength, int $maxErrors) + { + // We are aware that this method is longer than desirable for code + // reading. However, we currently do not see a refactoring that has a + // sensible cost-benefit ratio. Suggestions are welcome! + + // self match? 
+ if ($this->leafCount[$node] == 1 && $this->nodeWordBegin[$node] == $wordPosition) { + return false; + } + + $currentNodeWordLength = min($this->nodeWordEnd[$node] - $this->nodeWordBegin[$node], $this->MAX_LENGTH - 1); + + // Do min edit distance + $currentLength = $this->calculateMaxLength( + $wordStart, + $wordPosition, + $node, + $maxErrors, + $currentNodeWordLength + ); + + if ($currentLength == 0) { + return false; + } + + if ($currentLength >= $this->MAX_LENGTH - 1) { + $this->reportBufferShortage($this->nodeWordBegin[$node], $currentNodeWordLength); + } + + // calculate cheapest match + $best = $maxErrors + 42; + $iBest = 0; + $jBest = 0; + + for ($k = 0; $k <= $currentLength; $k++) { + $i = $currentLength - $k; + $j = $currentLength; + + if ($this->edBuffer[$i][$j] < $best) { + $best = $this->edBuffer[$i][$j]; + $iBest = $i; + $jBest = $j; + } + + $i = $currentLength; + $j = $currentLength - $k; + + if ($this->edBuffer[$i][$j] < $best) { + $best = $this->edBuffer[$i][$j]; + $iBest = $i; + $jBest = $j; + } + } + + while ($wordPosition + $iBest < count($this->word) && + $jBest < $currentNodeWordLength && + $this->word[$wordPosition + $iBest] != $this->word[$this->nodeWordBegin[$node] + $jBest] && + $this->word[$wordPosition + $iBest]->equals( + $this->word[$this->nodeWordBegin[$node] + $jBest] + )) { + $iBest++; + $jBest++; + } + + $numReported = 0; + + if ($currentLength == $currentNodeWordLength) { + // we may proceed + for ($e = $this->nodeChildFirst[$node]; $e >= 0; $e = $this->nodeChildNext[$e]) { + if ($this->matchWord( + $wordStart, + $wordPosition + $iBest, + $this->nodeChildNode[$e], + $nodeWordLength + $jBest, + $maxErrors + - $best + )) { + $numReported++; + } + } + } + + // do not report locally if had reports in exactly one subtree (would be + // pure subclone) + if ($numReported == 1) { + return true; + } + + // disallow tail changes + while ($iBest > 0 && + $jBest > 0 && + !$this->word[$wordPosition + $iBest - 1]->equals( + 
$this->word[$this->nodeWordBegin[$node] + $jBest - 1] + )) { + if ($iBest > 1 && + $this->word[$wordPosition + $iBest - 2]->equals( + $this->word[$this->nodeWordBegin[$node] + $jBest - 1] + )) { + $iBest--; + } elseif ($jBest > 1 && + $this->word[$wordPosition + $iBest - 1]->equals( + $this->word[$this->nodeWordBegin[$node] + $jBest - 2] + )) { + $jBest--; + } else { + $iBest--; + $jBest--; + } + } + + // report if real clone + if ($iBest > 0 && $jBest > 0) { + $numReported++; + $this->reportClone($wordStart, $wordPosition + $iBest, $node, $jBest, $nodeWordLength + $jBest); + } + + return $numReported > 0; + } + + /** + * Calculates the maximum length we may take along the word to the current + * $node (respecting the number of errors to make). *. + * + * @param int $wordStart the start position of the currently matched word (position in + * the input word) + * @param int $wordPosition the current position along the input word + * @param int $node the node we are currently at (i.e. the edge leading to this + * node is relevant to us). + * @param int $maxErrors the number of errors still allowed + * @param int $currentNodeWordLength the length of the word found along the nodes (this may be + * different from the actual length due to buffer limits) + * + * @return int the maximal length that can be taken + */ + private function calculateMaxLength( + int $wordStart, + int $wordPosition, + int $node, + int $maxErrors, + int $currentNodeWordLength + ) { + $this->edBuffer[0][0] = 0; + $currentLength = 1; + + for (; $currentLength <= $currentNodeWordLength; $currentLength++) { + /** @var int */ + $best = $currentLength; + $this->edBuffer[0][$currentLength] = $currentLength; + $this->edBuffer[$currentLength][0] = $currentLength; + + if ($wordPosition + $currentLength >= count($this->word)) { + break; + } + + // deal with case that character may not be matched (sentinel!) 
+ $iChar = $this->word[$wordPosition + $currentLength - 1]; + $jChar = $this->word[$this->nodeWordBegin[$node] + $currentLength - 1]; + + if ($this->mayNotMatch($iChar) || $this->mayNotMatch($jChar)) { + break; + } + + // usual matrix completion for edit distance + for ($k = 1; $k < $currentLength; $k++) { + $best = min( + $best, + $this->fillEDBuffer( + $k, + $currentLength, + $wordPosition, + $this->nodeWordBegin[$node] + ) + ); + } + + for ($k = 1; $k < $currentLength; $k++) { + $best = min( + $best, + $this->fillEDBuffer( + $currentLength, + $k, + $wordPosition, + $this->nodeWordBegin[$node] + ) + ); + } + $best = min( + $best, + $this->fillEDBuffer( + $currentLength, + $currentLength, + $wordPosition, + $this->nodeWordBegin[$node] + ) + ); + + if ($best > $maxErrors || + $wordPosition - $wordStart + $currentLength <= $this->headEquality && + $best > 0) { + break; + } + } + $currentLength--; + + return $currentLength; + } + + private function reportClone( + int $wordBegin, + int $wordEnd, + int $currentNode, + int $nodeWordPos, + int $nodeWordLength + ): void { + $length = $wordEnd - $wordBegin; + + if ($length < $this->minLength || $nodeWordLength < $this->minLength) { + return; + } + + // NB: 0 and 0 are two indicate the template S and T for Psalm, in lack of generics. + $otherClones = new PairList(16, 0, 0); + $this->findRemainingClones( + $otherClones, + $nodeWordLength, + $currentNode, + $this->nodeWordEnd[$currentNode] - $this->nodeWordBegin[$currentNode] - $nodeWordPos, + $wordBegin + ); + + $occurrences = 1 + $otherClones->size(); + + // check whether we may start from here + $t = $this->word[$wordBegin]; + $newInfo = new CloneInfo($length, $wordBegin, $occurrences, $t, $otherClones); + + for ($index = max(0, $wordBegin - $this->INDEX_SPREAD + 1); $index <= $wordBegin; $index++) { + $existingClones = $this->cloneInfos[$index] ?? 
null; + + if ($existingClones != null) { + //for (CloneInfo cloneInfo : $existingClones) { + foreach ($existingClones as $cloneInfo) { + if ($cloneInfo->dominates($newInfo, $wordBegin - $index)) { + // we already have a dominating clone, so ignore + return; + } + } + } + } + + // add clone to $otherClones to avoid getting more duplicates + for ($i = $wordBegin; $i < $wordEnd; $i += $this->INDEX_SPREAD) { + $this->cloneInfos[$i][] = new CloneInfo($length - ($i - $wordBegin), $wordBegin, $occurrences, $t, $otherClones); + } + $t = $this->word[$wordBegin]; + + for ($clone = 0; $clone < $otherClones->size(); $clone++) { + $start = $otherClones->getFirst($clone); + $otherLength = $otherClones->getSecond($clone); + + for ($i = 0; $i < $otherLength; $i += $this->INDEX_SPREAD) { + $this->cloneInfos[$start + $i][] = new CloneInfo($otherLength - $i, $wordBegin, $occurrences, $t, $otherClones); + } + } + } + + /** + * Fills the edit distance buffer at position (i,j). + * + * @param int $i the first index of the buffer + * @param int $j the second index of the buffer + * @param int $iOffset the offset where the word described by $i starts + * @param int $jOffset the offset where the word described by $j starts + * + * @return int the value inserted into the buffer + */ + private function fillEDBuffer(int $i, int $j, int $iOffset, int $jOffset) + { + $iChar = $this->word[$iOffset + $i - 1]; + $jChar = $this->word[$jOffset + $j - 1]; + + $insertDelete = 1 + min($this->edBuffer[$i - 1][$j], $this->edBuffer[$i][$j - 1]); + $change = $this->edBuffer[$i - 1][$j - 1] + ($iChar->equals($jChar) ? 0 : 1); + + return $this->edBuffer[$i][$j] = min($insertDelete, $change); + } + + /** + * Fills a list of pairs giving the start positions and lengths of the + * remaining clones. 
+ * + * @param PairList $clonePositions the clone positions being filled (start position and length) + * @param int $nodeWordLength the length of the word along the nodes + * @param int $currentNode the node we are currently at + * @param int $distance the distance along the word leading to the current node + * @param int $wordStart the start of the currently searched word + */ + private function findRemainingClones( + PairList $clonePositions, + int $nodeWordLength, + int $currentNode, + int $distance, + int $wordStart + ): void { + for ($nextNode = $this->nodeChildFirst[$currentNode]; $nextNode >= 0; $nextNode = $this->nodeChildNext[$nextNode]) { + $node = $this->nodeChildNode[$nextNode]; + $this->findRemainingClones($clonePositions, $nodeWordLength, $node, $distance + + $this->nodeWordEnd[$node] - $this->nodeWordBegin[$node], $wordStart); + } + + if ($this->nodeChildFirst[$currentNode] < 0) { + $start = count($this->word) - $distance - $nodeWordLength; + + if ($start != $wordStart) { + $clonePositions->add($start, $nodeWordLength); + } + } + } +} diff --git a/src/Detector/Strategy/SuffixTree/CloneInfo.php b/src/Detector/Strategy/SuffixTree/CloneInfo.php new file mode 100644 index 00000000..4187996c --- /dev/null +++ b/src/Detector/Strategy/SuffixTree/CloneInfo.php @@ -0,0 +1,68 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ +namespace SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree; + +/** Stores information on a clone. */ +class CloneInfo +{ + /** + * Length of the clone in tokens. + * + * @var int + */ + public $length; + + /** + * Position in word list. + * + * @var int + */ + public $position; + + /** + * @var AbstractToken + */ + public $token; + + /** + * Related clones. + * + * @var PairList + */ + public $otherClones; + + /** + * Number of occurrences of the clone. + * + * @var int + */ + private $occurrences; + + /** Constructor. 
*/
+    public function __construct(int $length, int $position, int $occurrences, AbstractToken $token, PairList $otherClones)
+    {
+        $this->length = $length;
+        $this->position = $position;
+        $this->occurrences = $occurrences;
+        $this->token = $token;
+        $this->otherClones = $otherClones;
+    }
+
+    /**
+     * Returns whether this clone info dominates the given one, i.e. whether
+     * both {@link #length} and {@link #occurrences} are not smaller.
+     *
+     * @param int $later the amount the given clone starts later than the "this" clone
+     */
+    public function dominates(self $ci, int $later): bool
+    {
+        return $this->length - $later >= $ci->length && $this->occurrences >= $ci->occurrences;
+    }
+}
diff --git a/src/Detector/Strategy/SuffixTree/PairList.php b/src/Detector/Strategy/SuffixTree/PairList.php
new file mode 100644
index 00000000..c0b851ec
--- /dev/null
+++ b/src/Detector/Strategy/SuffixTree/PairList.php
@@ -0,0 +1,227 @@
+
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+namespace SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree;
+
+use Exception;
+
+/**
+ * A list for storing pairs in a specific order.
+ *
+ * The pairs live in two parallel arrays; only the first {@link #size} slots
+ * of each array are part of the list. Slots beyond that are padding or
+ * leftovers from clear()/removeLast() and must be ignored.
+ *
+ * @author $Author: hummelb $
+ *
+ * @version $Rev: 51770 $
+ * @ConQAT.Rating GREEN Hash: 7459D6D0F59028B37DD23DD091BDCEEA
+ *
+ * @template T
+ * @template S
+ */
+class PairList
+{
+    /**
+     * Version used for serialization.
+     *
+     * @var int
+     */
+    private $serialVersionUID = 1;
+
+    /**
+     * The current size.
+     *
+     * @var int
+     */
+    private $size = 0;
+
+    /**
+     * The array used for storing the S.
+     *
+     * @var S[]
+     */
+    private $firstElements;
+
+    /**
+     * The array used for storing the T.
+     *
+     * @var T[]
+     */
+    private $secondElements;
+
+    /**
+     * Pre-fills both backing arrays with $initialCapacity null slots (at least one).
+     *
+     * @param S $firstType  not read by this constructor; kept for signature compatibility
+     * @param T $secondType not read by this constructor; kept for signature compatibility
+     */
+    public function __construct(int $initialCapacity, $firstType, $secondType)
+    {
+        if ($initialCapacity < 1) {
+            $initialCapacity = 1;
+        }
+        $this->firstElements = array_fill(0, $initialCapacity, null);
+        $this->secondElements = array_fill(0, $initialCapacity, null);
+    }
+
+    /** Returns whether the list is empty. */
+    public function isEmpty(): bool
+    {
+        return $this->size === 0;
+    }
+
+    /** Returns the size of the list. */
+    public function size(): int
+    {
+        return $this->size;
+    }
+
+    /**
+     * Add the given pair to the list.
+     *
+     * @param S $first
+     * @param T $second
+     */
+    public function add($first, $second): void
+    {
+        $this->firstElements[$this->size] = $first;
+        $this->secondElements[$this->size] = $second;
+        $this->size++;
+    }
+
+    /** Adds all pairs from another list. */
+    public function addAll(self $other): void
+    {
+        // Snapshot the size first: when $other is this very list, $other->size
+        // grows while we append and the loop would never terminate.
+        $otherSize = $other->size;
+
+        for ($i = 0; $i < $otherSize; $i++) {
+            $this->firstElements[$this->size] = $other->firstElements[$i];
+            $this->secondElements[$this->size] = $other->secondElements[$i];
+            $this->size++;
+        }
+    }
+
+    /**
+     * Returns the first element at given index.
+     *
+     * @throws Exception if $i is outside 0..size-1
+     *
+     * @return S
+     */
+    public function getFirst(int $i)
+    {
+        $this->checkWithinBounds($i);
+
+        return $this->firstElements[$i];
+    }
+
+    /**
+     * Sets the first element at given index.
+     *
+     * @param S $value
+     *
+     * @throws Exception if $i is outside 0..size-1
+     */
+    public function setFirst(int $i, $value): void
+    {
+        $this->checkWithinBounds($i);
+        $this->firstElements[$i] = $value;
+    }
+
+    /**
+     * Returns the second element at given index.
+     *
+     * @throws Exception if $i is outside 0..size-1
+     *
+     * @return T
+     */
+    public function getSecond(int $i)
+    {
+        $this->checkWithinBounds($i);
+
+        return $this->secondElements[$i];
+    }
+
+    /**
+     * Sets the second element at given index.
+     *
+     * @param T $value
+     *
+     * @throws Exception if $i is outside 0..size-1
+     */
+    public function setSecond(int $i, $value): void
+    {
+        $this->checkWithinBounds($i);
+        $this->secondElements[$i] = $value;
+    }
+
+    /**
+     * Creates a new list containing all first elements.
+     *
+     * @return S[]
+     */
+    public function extractFirstList(): array
+    {
+        $result = [];
+
+        for ($i = 0; $i < $this->size; $i++) {
+            $result[] = $this->firstElements[$i];
+        }
+
+        return $result;
+    }
+
+    /**
+     * Creates a new list containing all second elements.
+     *
+     * @return T[]
+     */
+    public function extractSecondList(): array
+    {
+        $result = [];
+
+        for ($i = 0; $i < $this->size; $i++) {
+            $result[] = $this->secondElements[$i];
+        }
+
+        return $result;
+    }
+
+    /** Swaps the entries located at indexes $i and $j. */
+    public function swapEntries(int $i, int $j): void
+    {
+        $tmp1 = $this->getFirst($i);
+        $tmp2 = $this->getSecond($i);
+        $this->setFirst($i, $this->getFirst($j));
+        $this->setSecond($i, $this->getSecond($j));
+        $this->setFirst($j, $tmp1);
+        $this->setSecond($j, $tmp2);
+    }
+
+    /** Clears this list. The backing arrays keep their old entries; only the size is reset. */
+    public function clear(): void
+    {
+        $this->size = 0;
+    }
+
+    /**
+     * Removes the last element of the list.
+     *
+     * @throws Exception if the list is empty
+     */
+    public function removeLast(): void
+    {
+        // Refuse to shrink an empty list: a negative size would make the next
+        // add() write to index -1 and break every later bounds check.
+        $this->checkWithinBounds($this->size - 1);
+        $this->size--;
+    }
+
+    /**
+     * Returns a hash over the size and the pairs currently in the list.
+     *
+     * Only the first {@link #size} entries of each backing array are hashed, so
+     * capacity padding and entries left behind by clear()/removeLast() do not
+     * influence the result.
+     */
+    public function hashCode(): int
+    {
+        $prime = 31;
+        $hash = $this->size;
+        $hash = $prime * $hash + crc32(serialize(array_slice($this->firstElements, 0, $this->size)));
+
+        return $prime * $hash + crc32(serialize(array_slice($this->secondElements, 0, $this->size)));
+    }
+
+    /**
+     * Checks whether the given $i is within the bounds. Throws an
+     * exception otherwise.
+     */
+    private function checkWithinBounds(int $i): void
+    {
+        if ($i < 0 || $i >= $this->size) {
+            throw new Exception('Out of bounds: ' . $i);
+        }
+    }
+}
diff --git a/src/Detector/Strategy/SuffixTree/Sentinel.php b/src/Detector/Strategy/SuffixTree/Sentinel.php
new file mode 100644
index 00000000..ad241485
--- /dev/null
+++ b/src/Detector/Strategy/SuffixTree/Sentinel.php
@@ -0,0 +1,48 @@
+
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+namespace SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree;
+
+/**
+ * A sentinel character which can be used to produce explicit leaves for all
+ * suffixes. The sentinel just has to be appended to the list before handing
+ * it to the suffix tree. For the sentinel equality and object identity are
+ * the same!
+ */
+class Sentinel extends AbstractToken
+{
+    /** @var int The hash value used. */
+    private $hash;
+
+    /** Creates a sentinel with a random hash value and placeholder token data. */
+    public function __construct()
+    {
+        $this->hash = rand(0, PHP_INT_MAX);
+        $this->tokenCode = -1;
+        $this->line = -1;
+        $this->file = '';
+        $this->tokenName = '';
+        $this->content = '';
+    }
+
+    public function __toString(): string
+    {
+        return '$';
+    }
+
+    public function hashCode(): int
+    {
+        return $this->hash;
+    }
+
+    /** Returns true only when $obj is this very sentinel instance. */
+    public function equals(AbstractToken $obj): bool
+    {
+        // `===` on objects compares identity, which is the "physical object
+        // equality" the original implementation relies on: two different
+        // sentinels must never match each other.
+        return $obj === $this;
+    }
+}
diff --git a/src/Detector/Strategy/SuffixTree/SuffixTree.php b/src/Detector/Strategy/SuffixTree/SuffixTree.php
new file mode 100644
index 00000000..73085e13
--- /dev/null
+++ b/src/Detector/Strategy/SuffixTree/SuffixTree.php
@@ -0,0 +1,314 @@
+
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+namespace SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree;
+
+/**
+ * Efficient linear time constructible suffix tree using Ukkonen's online
+ * construction algorithm (E. Ukkonen: "On-line construction of suffix trees").
+ * Most of the comments reference this paper and it might be hard to follow + * without knowing at least the basics of it. + *

+ * We use some conventions which are slightly different from the paper however: + *

    + *
  • The names of the variables are different, but we give a translation into + * Ukkonen's names.
  • + *
  • Many variables are made "global" by realizing them as fields. This way we + * can easily deal with those tuple return values without constructing extra + * classes.
  • + *
  • String indices start at 0 (not at 1).
  • + *
  • Substrings are marked by the first index and the index after the last one + * (just as in C++ STL) instead of the first and the last index (i.e. intervals + * are right-open instead of closed). This makes it more intuitive to express + * the empty string (i.e. (i,i) instead of (i,i-1)).
  • + *
+ *

+ * Everything but the construction itself is protected to simplify increasing
+ * its functionality by subclassing but without introducing new method calls.
+ *
+ * @author Benjamin Hummel
+ * @author $Author: kinnen $
+ *
+ * @version $Revision: 41751 $
+ * @ConQAT.Rating GREEN Hash: 4B2EF0606B3085A6831764ED042FF20D
+ */
+class SuffixTree
+{
+    /**
+     * Infinity in this context (the constructor sets it to the length of the word).
+     *
+     * @var int
+     */
+    protected $INFTY;
+
+    /**
+     * The word we are working on.
+     *
+     * @var AbstractToken[]
+     */
+    protected $word;
+
+    /**
+     * The number of nodes created so far.
+     *
+     * @var int
+     */
+    protected $numNodes = 0;
+
+    /**
+     * For each node this holds the index of the first character of
+     * {@link #word} labeling the transition to this node. This
+     * corresponds to the k for a transition used in Ukkonen's paper.
+     *
+     * @var int[]
+     */
+    protected $nodeWordBegin;
+
+    /**
+     * For each node this holds the index of the one after the last character of
+     * {@link #word} labeling the transition to this node. This
+     * corresponds to the p for a transition used in Ukkonen's paper.
+     *
+     * @var int[]
+     */
+    protected $nodeWordEnd;
+
+    /** For each node its suffix link (called function f by Ukkonen).
+     * @var int[] */
+    protected $suffixLink;
+
+    /**
+     * The next node function realized as a hash table. This corresponds to the
+     * g function used in Ukkonen's paper.
+     *
+     * @var SuffixTreeHashTable
+     */
+    protected $nextNode;
+
+    /**
+     * An array giving for each node the index where the first child will be
+     * stored (or -1 if it has no children). It is initially empty and will be
+     * filled "on demand" using
+     * {@link org.conqat.engine.code_clones.detection.suffixtree.SuffixTreeHashTable#extractChildLists(int[], int[], int[])}
+     * .
+     *
+     * @var int[]
+     */
+    protected $nodeChildFirst = [];
+
+    /**
+     * This array gives the next index of the child list or -1 if this is the
+     * last one. It is initially empty and will be filled "on demand" using
+     * {@link org.conqat.engine.code_clones.detection.suffixtree.SuffixTreeHashTable#extractChildLists(int[], int[], int[])}
+     * .
+     *
+     * @var int[]
+     */
+    protected $nodeChildNext = [];
+
+    /**
+     * This array stores the actual name (=number) of the node in the child
+     * list. It is initially empty and will be filled "on demand" using
+     * {@link org.conqat.engine.code_clones.detection.suffixtree.SuffixTreeHashTable#extractChildLists(int[], int[], int[])}
+     * .
+     *
+     * @var int[]
+     */
+    protected $nodeChildNode = [];
+
+    /**
+     * The node we are currently at as a "global" variable (as it is always
+     * passed unchanged). This is called s in Ukkonen's paper.
+     *
+     * @var int
+     */
+    private $currentNode = 0;
+
+    /**
+     * Beginning of the word part of the reference pair. This is kept "global"
+     * (in contrast to the end) as this is passed unchanged to all functions.
+     * Ukkonen calls this k.
+     *
+     * @var int
+     */
+    private $refWordBegin = 0;
+
+    /**
+     * This is the new (or old) explicit state as returned by
+     * {@link #testAndSplit(int, Object)}. Ukkonen calls this r.
+     *
+     * @var int
+     */
+    private $explicitNode = 0;
+
+    /**
+     * Create a new suffix tree from a given word. The word given as parameter
+     * is used internally and should not be modified anymore, so copy it before
+     * if required.
+     *
+     * @param AbstractToken[] $word
+     */
+    public function __construct($word)
+    {
+        $this->word = $word;
+        $size = count($word);
+        $this->INFTY = $size;
+
+        // The node arrays and the hash table are pre-sized for 2 * $size nodes.
+        $expectedNodes = 2 * $size;
+        $this->nodeWordBegin = array_fill(0, $expectedNodes, 0);
+        $this->nodeWordEnd = array_fill(0, $expectedNodes, 0);
+        $this->suffixLink = array_fill(0, $expectedNodes, 0);
+        $this->nextNode = new SuffixTreeHashTable($expectedNodes);
+
+        $this->createRootNode();
+
+        // Insert the word one character at a time, re-canonizing the reference
+        // pair after each inserted character.
+        for ($i = 0; $i < $size; $i++) {
+            $this->update($i);
+            $this->canonize($i + 1);
+        }
+    }
+
+    /**
+     * This method makes sure the child lists are filled (required for
+     * traversing the tree).
+     */
+    protected function ensureChildLists(): void
+    {
+        // The loose `== null` is also true for the initial empty array, so the
+        // lists are (re)built on first use and whenever they are too short.
+        if ($this->nodeChildFirst == null || count($this->nodeChildFirst) < $this->numNodes) {
+            $this->nodeChildFirst = array_fill(0, $this->numNodes, 0);
+            $this->nodeChildNext = array_fill(0, $this->numNodes, 0);
+            $this->nodeChildNode = array_fill(0, $this->numNodes, 0);
+            $this->nextNode->extractChildLists($this->nodeChildFirst, $this->nodeChildNext, $this->nodeChildNode);
+        }
+    }
+
+    /**
+     * Creates the root node.
+     */
+    private function createRootNode(): void
+    {
+        $this->numNodes = 1;
+        $this->nodeWordBegin[0] = 0;
+        $this->nodeWordEnd[0] = 0;
+        // -1 is the "trap state" that testAndSplit() and canonize() check for.
+        $this->suffixLink[0] = -1;
+    }
+
+    /**
+     * The update function as defined in Ukkonen's paper. This inserts
+     * the character at charPos into the tree. It works on the canonical
+     * reference pair ({@link #currentNode}, ({@link #refWordBegin}, charPos)).
+     */
+    private function update(int $charPos): void
+    {
+        // 0 is the root; it doubles as "no node needs a suffix link yet".
+        $lastNode = 0;
+
+        while (!$this->testAndSplit($charPos, $this->word[$charPos])) {
+            // New leaf below the explicit node; its label runs from $charPos to INFTY.
+            $newNode = $this->numNodes++;
+            $this->nodeWordBegin[$newNode] = $charPos;
+            $this->nodeWordEnd[$newNode] = $this->INFTY;
+            $this->nextNode->put($this->explicitNode, $this->word[$charPos], $newNode);
+
+            if ($lastNode != 0) {
+                $this->suffixLink[$lastNode] = $this->explicitNode;
+            }
+            $lastNode = $this->explicitNode;
+            $this->currentNode = $this->suffixLink[$this->currentNode];
+            $this->canonize($charPos);
+        }
+
+        if ($lastNode != 0) {
+            $this->suffixLink[$lastNode] = $this->currentNode;
+        }
+    }
+
+    /**
+     * The test-and-split function as defined in Ukkonen's paper. This
+     * checks whether the state given by the canonical reference pair (
+     * {@link #currentNode}, ({@link #refWordBegin}, refWordEnd)) is the end
+     * point (by checking whether a transition for the
+     * nextCharacter exists). Additionally the state is made
+     * explicit if it is not already and this is not the end-point. It returns
+     * true if the end-point was reached. The newly created (or reached)
+     * explicit node is returned in the "global" variable {@link #explicitNode}.
+     */
+    private function testAndSplit(int $refWordEnd, AbstractToken $nextCharacter): bool
+    {
+        if ($this->currentNode < 0) {
+            // trap state is always end state
+            return true;
+        }
+
+        // Empty word part: the state is the explicit node $currentNode itself.
+        if ($refWordEnd <= $this->refWordBegin) {
+            if ($this->nextNode->get($this->currentNode, $nextCharacter) < 0) {
+                $this->explicitNode = $this->currentNode;
+
+                return false;
+            }
+
+            return true;
+        }
+
+        $next = $this->nextNode->get($this->currentNode, $this->word[$this->refWordBegin]);
+
+        if ($nextCharacter->equals($this->word[$this->nodeWordBegin[$next] + $refWordEnd - $this->refWordBegin])) {
+            return true;
+        }
+
+        // not an end-point and not explicit, so make it explicit.
+        $this->explicitNode = $this->numNodes++;
+        $this->nodeWordBegin[$this->explicitNode] = $this->nodeWordBegin[$next];
+        $this->nodeWordEnd[$this->explicitNode] = $this->nodeWordBegin[$next] + $refWordEnd - $this->refWordBegin;
+        $this->nextNode->put($this->currentNode, $this->word[$this->refWordBegin], $this->explicitNode);
+
+        // The old child keeps the remainder of the label and hangs below the new node.
+        $this->nodeWordBegin[$next] += $refWordEnd - $this->refWordBegin;
+        $this->nextNode->put($this->explicitNode, $this->word[$this->nodeWordBegin[$next]], $next);
+
+        return false;
+    }
+
+    /**
+     * The canonize function as defined in Ukkonen's paper. Changes the
+     * reference pair (currentNode, (refWordBegin, refWordEnd)) into a canonical
+     * reference pair. It works on the "global" variables {@link #currentNode}
+     * and {@link #refWordBegin} and the parameter, writing the result back to
+     * the globals.
+     *
+     * @param int $refWordEnd one after the end index for the word of the reference pair
+     */
+    private function canonize(int $refWordEnd): void
+    {
+        if ($this->currentNode === -1) {
+            // explicitly handle trap state
+            $this->currentNode = 0;
+            $this->refWordBegin++;
+        }
+
+        if ($refWordEnd <= $this->refWordBegin) {
+            // empty word, so already canonical
+            return;
+        }
+
+        $next = $this->nextNode->get(
+            $this->currentNode,
+            $this->word[$this->refWordBegin]
+        );
+
+        // Walk down while the whole label of the next transition fits into the
+        // remaining word part of the reference pair.
+        while ($this->nodeWordEnd[$next] - $this->nodeWordBegin[$next] <= $refWordEnd
+                - $this->refWordBegin) {
+            $this->refWordBegin += $this->nodeWordEnd[$next] - $this->nodeWordBegin[$next];
+            $this->currentNode = $next;
+
+            if ($refWordEnd > $this->refWordBegin) {
+                $next = $this->nextNode->get($this->currentNode, $this->word[$this->refWordBegin]);
+            } else {
+                break;
+            }
+        }
+    }
+}
diff --git a/src/Detector/Strategy/SuffixTree/SuffixTreeHashTable.php b/src/Detector/Strategy/SuffixTree/SuffixTreeHashTable.php
new file mode 100644
index 00000000..4bf807ee
--- /dev/null
+++ b/src/Detector/Strategy/SuffixTree/SuffixTreeHashTable.php
@@ -0,0 +1,232 @@
+
+ *
+ * For the full copyright and
license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+namespace SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree;
+
+/**
+ * The hash table used for the {@link SuffixTree} class. It is specifically
+ * written and optimized for its implementation and is thus probably of little
+ * use for any other application.
+ *
+ * It hashes from (node, character) pairs to the next node, where nodes are
+ * represented by integers and characters are {@link AbstractToken} instances.
+ *
+ * @author Benjamin Hummel
+ * @author $Author: juergens $
+ *
+ * @version $Revision: 34670 $
+ * @ConQAT.Rating GREEN Hash: 6A7A830078AF0CA9C2D84C148F336DF4
+ */
+class SuffixTreeHashTable
+{
+    /**
+     * These numbers were taken from
+     * http://planetmath.org/encyclopedia/GoodHashTablePrimes.html.
+     *
+     * @var int[]
+     */
+    private $allowedSizes = [53, 97, 193, 389, 769, 1543,
+        3079, 6151, 12289, 24593, 49157, 98317, 196613, 393241, 786433,
+        1572869, 3145739, 6291469, 12582917, 25165843, 50331653, 100663319,
+        201326611, 402653189, 805306457, 1610612741, ];
+
+    /**
+     * The size of the hash table.
+     *
+     * @var int
+     */
+    private $tableSize;
+
+    /**
+     * Storage space for the node part of the key.
+     *
+     * @var int[]
+     */
+    private $keyNodes;
+
+    /**
+     * Storage space for the character part of the key. A null entry marks an
+     * unused slot.
+     *
+     * @var (AbstractToken|null)[]
+     */
+    private $keyChars;
+
+    /**
+     * Storage space for the result node.
+     *
+     * @var int[]
+     */
+    private $resultNodes;
+
+    /**
+     * Debug info: number of stored nodes.
+     *
+     * @var int
+     */
+    private $_numStoredNodes = 0;
+
+    /**
+     * Debug info: number of calls to find so far.
+     *
+     * @var int
+     */
+    private $_numFind = 0;
+
+    /**
+     * Debug info: number of collisions (i.e. wrong finds) during find so far.
+     *
+     * @var int
+     */
+    private $_numColl = 0;
+
+    /**
+     * Creates a new hash table for the given number of nodes. Trying to add
+     * more nodes will result in worse performance down to entering an infinite
+     * loop on some operations.
+     */
+    public function __construct(int $numNodes)
+    {
+        // Pick the first allowed prime that is at least 1.5 times the node count.
+        $minSize = (int) ceil(1.5 * $numNodes);
+        $sizeIndex = 0;
+
+        while ($this->allowedSizes[$sizeIndex] < $minSize) {
+            $sizeIndex++;
+        }
+        $this->tableSize = $this->allowedSizes[$sizeIndex];
+
+        $this->keyNodes = array_fill(0, $this->tableSize, 0);
+        $this->keyChars = array_fill(0, $this->tableSize, null);
+        $this->resultNodes = array_fill(0, $this->tableSize, 0);
+    }
+
+    /**
+     * Returns the next node for the given (node, character) key pair or a
+     * negative value if no next node is stored for this key.
+     */
+    public function get(int $keyNode, AbstractToken $keyChar): int
+    {
+        $pos = $this->hashFind($keyNode, $keyChar);
+
+        if ($this->keyChars[$pos] === null) {
+            return -1;
+        }
+
+        return $this->resultNodes[$pos];
+    }
+
+    /**
+     * Inserts the given result node for the (node, character) key pair. An
+     * existing entry for the same key is overwritten.
+     */
+    public function put(int $keyNode, AbstractToken $keyChar, int $resultNode): void
+    {
+        $pos = $this->hashFind($keyNode, $keyChar);
+
+        // Strict null check, matching get() and hashFind(): null marks a free slot.
+        if ($this->keyChars[$pos] === null) {
+            $this->_numStoredNodes++;
+            $this->keyChars[$pos] = $keyChar;
+            $this->keyNodes[$pos] = $keyNode;
+        }
+        $this->resultNodes[$pos] = $resultNode;
+    }
+
+    /**
+     * Extracts the list of child nodes for each node from the hash table
+     * entries as a linked list. All arrays are expected to be initially empty
+     * and of suitable size (i.e. for n nodes it should have size
+     * n given that nodes are numbered 0 to n-1). Those arrays will be
+     * filled from this method.
+     *
+     * The method is public only because it is tightly coupled to the
+     * {@link SuffixTree} class, which calls it.
+     *
+     * @param int[] $nodeFirstIndex an array giving for each node the index where the first child
+     *                              will be stored (or -1 if it has no children)
+     * @param int[] $nodeNextIndex  this array gives the next index of the child list or -1 if
+     *                              this is the last one
+     * @param int[] $nodeChild      this array stores the actual name (=number) of the node in the
+     *                              child list
+     */
+    public function extractChildLists(array &$nodeFirstIndex, array &$nodeNextIndex, array &$nodeChild): void
+    {
+        // Instead of Arrays.fill($nodeFirstIndex, -1);
+        foreach (array_keys($nodeFirstIndex) as $k) {
+            $nodeFirstIndex[$k] = -1;
+        }
+        $free = 0;
+
+        for ($i = 0; $i < $this->tableSize; $i++) {
+            if ($this->keyChars[$i] !== null) {
+                // insert $this->keyNodes[$i] -> $this->resultNodes[$i]
+                // at the head of the child list of $this->keyNodes[$i]
+                $nodeChild[$free] = $this->resultNodes[$i];
+                $nodeNextIndex[$free] = $nodeFirstIndex[$this->keyNodes[$i]];
+                $nodeFirstIndex[$this->keyNodes[$i]] = $free++;
+            }
+        }
+    }
+
+    /**
+     * Returns the position of the (node,char) key in the hash map or the
+     * position to insert it into if it is not yet in.
+     */
+    private function hashFind(int $keyNode, AbstractToken $keyChar): int
+    {
+        $this->_numFind++;
+        $hash = $keyChar->hashCode();
+        $pos = $this->posMod($this->primaryHash($keyNode, $hash));
+        $secondary = $this->secondaryHash($keyNode, $hash);
+
+        // Step by the secondary hash until the key or a free slot is found.
+        while ($this->keyChars[$pos] !== null) {
+            if ($this->keyNodes[$pos] === $keyNode && $keyChar->equals($this->keyChars[$pos])) {
+                break;
+            }
+            $this->_numColl++;
+            $pos = ($pos + $secondary) % $this->tableSize;
+        }
+
+        return $pos;
+    }
+
+    /**
+     * Returns the primary hash value for a (node, character) key pair.
+     */
+    private function primaryHash(int $keyNode, int $keyCharHash): int
+    {
+        return $keyCharHash ^ (13 * $keyNode);
+    }
+
+    /**
+     * Returns the secondary hash value for a (node, character) key pair.
+     * The result is never 0, so probing in hashFind() always advances.
+     */
+    private function secondaryHash(int $keyNode, int $keyCharHash): int
+    {
+        $result = $this->posMod(($keyCharHash ^ (1025 * $keyNode)));
+
+        if ($result === 0) {
+            return 2;
+        }
+
+        return $result;
+    }
+
+    /**
+     * Returns the smallest non-negative number congruent to x modulo
+     * {@link #tableSize}.
+     */
+    private function posMod(int $x): int
+    {
+        $x %= $this->tableSize;
+
+        if ($x < 0) {
+            $x += $this->tableSize;
+        }
+
+        return $x;
+    }
+}
diff --git a/src/Detector/Strategy/SuffixTree/Token.php b/src/Detector/Strategy/SuffixTree/Token.php
new file mode 100644
index 00000000..d34f9074
--- /dev/null
+++ b/src/Detector/Strategy/SuffixTree/Token.php
@@ -0,0 +1,42 @@
+
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+namespace SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree;
+
+/**
+ * A token of the analysed source: its token code and name, the line and file
+ * it was read from, and its text content.
+ */
+class Token extends AbstractToken
+{
+    public function __construct(
+        int $tokenCode,
+        string $tokenName,
+        int $line,
+        string $file,
+        string $content
+    ) {
+        $this->tokenCode = $tokenCode;
+        $this->tokenName = $tokenName;
+        $this->line = $line;
+        $this->content = $content;
+        $this->file = $file;
+    }
+
+    public function __toString(): string
+    {
+        return $this->tokenName;
+    }
+
+    /** Returns the CRC32 checksum of the token content. */
+    public function hashCode(): int
+    {
+        return crc32($this->content);
+    }
+
+    /**
+     * Two tokens are equal when their content is identical. Line, file and
+     * token code are not compared.
+     */
+    public function equals(AbstractToken $token): bool
+    {
+        // Compare the content itself rather than its checksum: different
+        // contents can share a CRC32 value and would otherwise be reported as
+        // equal. Equal content still implies an equal hashCode().
+        return $token instanceof self && $token->content === $this->content;
+    }
+}
diff --git a/src/Detector/Strategy/SuffixTreeStrategy.php b/src/Detector/Strategy/SuffixTreeStrategy.php
new file mode 100644
index 00000000..73528e9c
--- /dev/null
+++ b/src/Detector/Strategy/SuffixTreeStrategy.php
@@ -0,0 +1,105 @@
+
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+namespace SebastianBergmann\PHPCPD\Detector\Strategy;
+
+use function array_keys;
+use function file_get_contents;
+use function is_array;
+use function token_get_all;
+use function token_name;
+use Exception;
+use SebastianBergmann\PHPCPD\CodeClone;
+use SebastianBergmann\PHPCPD\CodeCloneFile;
+use SebastianBergmann\PHPCPD\CodeCloneMap;
+use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\AbstractToken;
+use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\ApproximateCloneDetectingSuffixTree;
+use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\Sentinel;
+use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\Token;
+
+/**
+ * Clone detection strategy that collects the tokens of all processed files
+ * into one word and searches it with a suffix tree.
+ *
+ * For the design of the algorithm, all credits go to the authors of "Do Code Clones Matter?".
+ */
+final class SuffixTreeStrategy extends AbstractStrategy
+{
+    /**
+     * Tokens of all files processed so far, in processing order.
+     *
+     * @var AbstractToken[]
+     */
+    private $word = [];
+
+    /**
+     * The map passed to the most recent processFile() call; postProcess()
+     * adds the detected clones to it.
+     *
+     * @var ?CodeCloneMap
+     */
+    private $result;
+
+    /**
+     * Tokenizes $file and appends its tokens to the word. No clones are
+     * reported here; that happens in postProcess().
+     */
+    public function processFile(string $file, CodeCloneMap $result): void
+    {
+        $content = file_get_contents($file);
+        $tokens = token_get_all($content);
+
+        foreach (array_keys($tokens) as $key) {
+            $token = $tokens[$key];
+
+            // Only array tokens are used; tokens whose type is in the ignore
+            // list are skipped as well.
+            if (!is_array($token) || isset($this->tokensIgnoreList[$token[0]])) {
+                continue;
+            }
+
+            $this->word[] = new Token(
+                $token[0],
+                token_name($token[0]),
+                $token[2],
+                $file,
+                $token[1]
+            );
+        }
+
+        $this->result = $result;
+    }
+
+    /**
+     * Builds the suffix tree over all collected tokens and adds one CodeClone
+     * to the result map for every other occurrence of every clone found.
+     *
+     * @throws Exception if processFile() has not been called before
+     */
+    public function postProcess(): void
+    {
+        if ($this->result === null) {
+            throw new Exception('Missing result');
+        }
+
+        // Sentinel = End of word
+        $this->word[] = new Sentinel();
+
+        $tree = new ApproximateCloneDetectingSuffixTree($this->word);
+        $cloneInfos = $tree->findClones(
+            $this->config->getMinTokens(),
+            $this->config->getEditDistance(),
+            $this->config->getHeadEquality()
+        );
+
+        foreach ($cloneInfos as $cloneInfo) {
+            /** @var int[] */
+            $others = $cloneInfo->otherClones->extractFirstList();
+
+            foreach ($others as $otherStart) {
+                $t = $this->word[$otherStart];
+                $lastToken = $this->word[$cloneInfo->position + $cloneInfo->length];
+                // If we stumbled upon the Sentinel, rewind one step.
+                // NOTE(review): the index below goes back two positions, not one - confirm which is intended.
+                if ($lastToken instanceof Sentinel) {
+                    $lastToken = $this->word[$cloneInfo->position + $cloneInfo->length - 2];
+                }
+                $lines = $lastToken->line - $cloneInfo->token->line;
+                $this->result->add(
+                    new CodeClone(
+                        new CodeCloneFile($cloneInfo->token->file, $cloneInfo->token->line),
+                        new CodeCloneFile($t->file, $t->line),
+                        $lines,
+                        // TODO: Double check this
+                        $otherStart + 1 - $cloneInfo->position
+                    )
+                );
+            }
+        }
+    }
+}
diff --git a/tests/fixture/editdistance1.php b/tests/fixture/editdistance1.php
new file mode 100644
index 00000000..61a13c3a
--- /dev/null
+++ b/tests/fixture/editdistance1.php
@@ -0,0 +1,27 @@
+question_l10ns->rows->row)) {
+    // Edit difference here.
+    if ($bTranslateLinksFields) {
+        $insertdata['question'] = translateLinks('survey', $iOldSID, $iNewSID, $insertdata['question']);
+        $insertdata['help'] = translateLinks('survey', $iOldSID, $iNewSID, $insertdata['help']);
+    }
+    $oQuestionL10n = new QuestionL10n();
+    $oQuestionL10n->question = $insertdata['question'];
+    $oQuestionL10n->help = $insertdata['help'];
+    $oQuestionL10n->language = $insertdata['language'];
+    unset($insertdata['question']);
+    unset($insertdata['help']);
+    unset($insertdata['language']);
+}
+
+// For some reason, two exact files will lead to one 0-line clone.
+$a = 10;
diff --git a/tests/fixture/editdistance2.php b/tests/fixture/editdistance2.php
new file mode 100644
index 00000000..14b44676
--- /dev/null
+++ b/tests/fixture/editdistance2.php
@@ -0,0 +1,24 @@
+question_l10ns->rows->row)) {
+    // Edit difference here.
+ if ($options['translinkfields']) { + $insertdata['question'] = translateLinks('survey', $iOldSID, $iNewSID, $insertdata['question']); + $insertdata['help'] = translateLinks('survey', $iOldSID, $iNewSID, $insertdata['help']); + } + $oQuestionL10n = new QuestionL10n(); + $oQuestionL10n->question = $insertdata['question']; + $oQuestionL10n->help = $insertdata['help']; + $oQuestionL10n->language = $insertdata['language']; + unset($insertdata['question']); + unset($insertdata['help']); + unset($insertdata['language']); +} + +foo(); diff --git a/tests/fixture/type3_clone.php b/tests/fixture/type3_clone.php new file mode 100644 index 00000000..5557e0bd --- /dev/null +++ b/tests/fixture/type3_clone.php @@ -0,0 +1,40 @@ + $b) { + return 'foo'; + } else { + return 'bar'; + } +} + +function bar() +{ + $a = 10; + $b = 20; + if ($a > $b) { + } else { + return 'bar'; + } +} + +function bar() +{ + $a = 10; + $b = '20'; + if ($a) { + return 'foo'; + } else { + return 'bar'; + } +} diff --git a/tests/unit/DetectorTest.php b/tests/unit/DetectorTest.php index c7d61813..fefcb0f8 100644 --- a/tests/unit/DetectorTest.php +++ b/tests/unit/DetectorTest.php @@ -13,11 +13,18 @@ use function next; use function sort; use PHPUnit\Framework\TestCase; +use SebastianBergmann\PHPCPD\ArgumentsBuilder; +use SebastianBergmann\PHPCPD\Detector\Strategy\AbstractStrategy; use SebastianBergmann\PHPCPD\Detector\Strategy\DefaultStrategy; +use SebastianBergmann\PHPCPD\Detector\Strategy\StrategyConfiguration; /** + * @covers \SebastianBergmann\PHPCPD\Arguments + * @covers \SebastianBergmann\PHPCPD\ArgumentsBuilder * @covers \SebastianBergmann\PHPCPD\Detector\Detector + * @covers \SebastianBergmann\PHPCPD\Detector\Strategy\AbstractStrategy * @covers \SebastianBergmann\PHPCPD\Detector\Strategy\DefaultStrategy + * @covers \SebastianBergmann\PHPCPD\Detector\Strategy\StrategyConfiguration * * @uses \SebastianBergmann\PHPCPD\CodeClone * @uses \SebastianBergmann\PHPCPD\CodeCloneFile @@ -28,11 +35,11 @@ final class 
DetectorTest extends TestCase /** * @dataProvider strategyProvider * - * @psalm-param class-string $strategy + * @psalm-param AbstractStrategy $strategy */ - public function testDetectingSimpleClonesWorks(string $strategy): void + public function testDetectingSimpleClonesWorks(AbstractStrategy $strategy): void { - $clones = (new Detector(new $strategy))->copyPasteDetection( + $clones = (new Detector($strategy))->copyPasteDetection( [__DIR__ . '/../fixture/Math.php'] ); @@ -117,18 +124,19 @@ public function testDetectingSimpleClonesWorks(string $strategy): void /** * @dataProvider strategyProvider - * - * @psalm-param class-string $strategy */ - public function testDetectingExactDuplicateFilesWorks(string $strategy): void + public function testDetectingExactDuplicateFilesWorks(AbstractStrategy $strategy): void { - $clones = (new Detector(new $strategy))->copyPasteDetection( + $argv = [1 => '.', '--min-lines', '20', '--min-tokens', '50']; + $arguments = (new ArgumentsBuilder)->build($argv); + $config = new StrategyConfiguration($arguments); + $strategy->setConfig($config); + + $clones = (new Detector($strategy))->copyPasteDetection( [ __DIR__ . '/../fixture/a.php', __DIR__ . '/../fixture/b.php', - ], - 20, - 60 + ] ); $clones = $clones->clones(); @@ -149,23 +157,25 @@ public function testDetectingExactDuplicateFilesWorks(string $strategy): void /** * @dataProvider strategyProvider - * - * @psalm-param class-string $strategy */ - public function testDetectingClonesInMoreThanTwoFiles(string $strategy): void + public function testDetectingClonesInMoreThanTwoFiles(AbstractStrategy $strategy): void { - $clones = (new Detector(new $strategy))->copyPasteDetection( + $argv = [1 => '.', '--min-lines', '20', '--min-tokens', '60']; + $arguments = (new ArgumentsBuilder)->build($argv); + $config = new StrategyConfiguration($arguments); + $strategy->setConfig($config); + + $clones = (new Detector($strategy))->copyPasteDetection( [ __DIR__ . '/../fixture/a.php', __DIR__ . 
'/../fixture/b.php', __DIR__ . '/../fixture/c.php', - ], - 20, - 60 + ] ); $clones = $clones->clones(); - $files = $clones[0]->files(); + //var_dump($clones); + $files = $clones[0]->files(); sort($files); $file = current($files); @@ -187,18 +197,18 @@ public function testDetectingClonesInMoreThanTwoFiles(string $strategy): void /** * @dataProvider strategyProvider - * - * @psalm-param class-string $strategy */ - public function testClonesAreIgnoredIfTheySpanLessTokensThanMinTokens(string $strategy): void + public function testClonesAreIgnoredIfTheySpanLessTokensThanMinTokens(AbstractStrategy $strategy): void { - $clones = (new Detector(new $strategy))->copyPasteDetection( + $argv = [1 => '.', '--min-lines', '20', '--min-tokens', '61']; + $arguments = (new ArgumentsBuilder)->build($argv); + $config = new StrategyConfiguration($arguments); + $strategy->setConfig($config); + $clones = (new Detector($strategy))->copyPasteDetection( [ __DIR__ . '/../fixture/a.php', __DIR__ . '/../fixture/b.php', - ], - 20, - 61 + ] ); $this->assertCount(0, $clones->clones()); @@ -206,18 +216,18 @@ public function testClonesAreIgnoredIfTheySpanLessTokensThanMinTokens(string $st /** * @dataProvider strategyProvider - * - * @psalm-param class-string $strategy */ - public function testClonesAreIgnoredIfTheySpanLessLinesThanMinLines(string $strategy): void + public function testClonesAreIgnoredIfTheySpanLessLinesThanMinLines(AbstractStrategy $strategy): void { - $clones = (new Detector(new $strategy))->copyPasteDetection( + $argv = [1 => '.', '--min-lines', '21', '--min-tokens', '60']; + $arguments = (new ArgumentsBuilder)->build($argv); + $config = new StrategyConfiguration($arguments); + $strategy->setConfig($config); + $clones = (new Detector($strategy))->copyPasteDetection( [ __DIR__ . '/../fixture/a.php', __DIR__ . 
'/../fixture/b.php', - ], - 21, - 60 + ] ); $this->assertCount(0, $clones->clones()); @@ -225,19 +235,18 @@ public function testClonesAreIgnoredIfTheySpanLessLinesThanMinLines(string $stra /** * @dataProvider strategyProvider - * - * @psalm-param class-string $strategy */ - public function testFuzzyClonesAreFound(string $strategy): void + public function testFuzzyClonesAreFound(AbstractStrategy $strategy): void { - $clones = (new Detector(new $strategy))->copyPasteDetection( + $argv = [1 => '.', '--min-lines', '5', '--min-tokens', '20', '--fuzzy', 'true']; + $arguments = (new ArgumentsBuilder)->build($argv); + $config = new StrategyConfiguration($arguments); + $strategy->setConfig($config); + $clones = (new Detector($strategy))->copyPasteDetection( [ __DIR__ . '/../fixture/a.php', __DIR__ . '/../fixture/d.php', - ], - 5, - 20, - true + ] ); $this->assertCount(1, $clones->clones()); @@ -245,25 +254,30 @@ public function testFuzzyClonesAreFound(string $strategy): void /** * @dataProvider strategyProvider - * - * @psalm-param class-string $strategy */ - public function testStripComments(string $strategy): void + public function testStripComments(AbstractStrategy $strategy): void { - $detector = new Detector(new $strategy); + $argv = [1 => '.', '--min-lines', '8', '--min-tokens', '10', '--fuzzy', 'true']; + $arguments = (new ArgumentsBuilder)->build($argv); + $config = new StrategyConfiguration($arguments); + $strategy->setConfig($config); + + $detector = new Detector($strategy); $clones = $detector->copyPasteDetection( [ __DIR__ . '/../fixture/e.php', __DIR__ . '/../fixture/f.php', - ], - 8, - 10, - true + ] ); $this->assertCount(0, $clones->clones()); + $argv = [1 => '.', '--min-lines', '7', '--min-tokens', '10', '--fuzzy', 'true']; + $arguments = (new ArgumentsBuilder)->build($argv); + $config = new StrategyConfiguration($arguments); + $strategy->setConfig($config); + $clones = $detector->copyPasteDetection( [ __DIR__ . 
'/../fixture/e.php', @@ -278,12 +292,17 @@ public function testStripComments(string $strategy): void } /** - * @psalm-return list + * @psalm-return list */ public function strategyProvider(): array { + // Build default config. + $argv = [1 => '.']; + $arguments = (new ArgumentsBuilder)->build($argv); + $config = new StrategyConfiguration($arguments); + return [ - [DefaultStrategy::class], + [new DefaultStrategy($config)], ]; } } diff --git a/tests/unit/EditDistanceTest.php b/tests/unit/EditDistanceTest.php new file mode 100644 index 00000000..51fbf14b --- /dev/null +++ b/tests/unit/EditDistanceTest.php @@ -0,0 +1,75 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ +namespace SebastianBergmann\PHPCPD\Detector; + +use PHPUnit\Framework\TestCase; +use SebastianBergmann\PHPCPD\ArgumentsBuilder; +use SebastianBergmann\PHPCPD\Detector\Strategy\DefaultStrategy; +use SebastianBergmann\PHPCPD\Detector\Strategy\StrategyConfiguration; +use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTreeStrategy; + +/** + * @covers \SebastianBergmann\PHPCPD\Arguments + * @covers \SebastianBergmann\PHPCPD\ArgumentsBuilder + * @covers \SebastianBergmann\PHPCPD\Detector\Detector + * @covers \SebastianBergmann\PHPCPD\Detector\Strategy\AbstractStrategy + * @covers \SebastianBergmann\PHPCPD\Detector\Strategy\DefaultStrategy + * @covers \SebastianBergmann\PHPCPD\Detector\Strategy\StrategyConfiguration + * @covers \SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\ApproximateCloneDetectingSuffixTree + * @covers \SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\CloneInfo + * @covers \SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\PairList + * @covers \SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\Sentinel + * @covers \SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\SuffixTree + * @covers \SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\SuffixTreeHashTable + * @covers 
\SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\Token
+ * @covers \SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTreeStrategy
+ *
+ * @uses \SebastianBergmann\PHPCPD\CodeClone
+ * @uses \SebastianBergmann\PHPCPD\CodeCloneFile
+ * @uses \SebastianBergmann\PHPCPD\CodeCloneMap
+ */
+final class EditDistanceTest extends TestCase
+{
+    /**
+     * With a minimum of 60 tokens the suffix tree strategy must report exactly
+     * one clone for the two edit-distance fixtures.
+     */
+    public function testEditDistanceWithSuffixtree(): void
+    {
+        $this->assertCount(1, $this->detectClones(SuffixTreeStrategy::class));
+    }
+
+    /**
+     * With the same settings the default (Rabin-Karp) strategy must report no
+     * clone for the two edit-distance fixtures.
+     */
+    public function testEditDistanceWithRabinkarp(): void
+    {
+        $this->assertCount(0, $this->detectClones(DefaultStrategy::class));
+    }
+
+    /**
+     * Runs the detector over both edit-distance fixtures with `--min-tokens 60`
+     * and returns what `clones()` yields on the detection result.
+     *
+     * @psalm-param class-string $strategyClass strategy constructed with the StrategyConfiguration
+     */
+    private function detectClones(string $strategyClass)
+    {
+        $argv = [1 => '.', '--min-tokens', '60'];
+        $arguments = (new ArgumentsBuilder)->build($argv);
+        $config = new StrategyConfiguration($arguments);
+        $strategy = new $strategyClass($config);
+
+        $clones = (new Detector($strategy))->copyPasteDetection(
+            [
+                __DIR__ . '/../fixture/editdistance1.php',
+                __DIR__ . '/../fixture/editdistance2.php',
+            ]
+        );
+
+        return $clones->clones();
+    }
+}