125 lines
3.7 KiB
PHP
125 lines
3.7 KiB
PHP
<?php
|
|
|
|
namespace App\Console\Commands;
|
|
|
|
use App\Helpers\FirstParty\ImageHash\ImageHashService;
|
|
use App\Models\MemeMedia;
|
|
use Illuminate\Console\Command;
|
|
|
|
class FindDuplicateImages extends Command
{
    /**
     * The name and signature of the console command.
     *
     * @var string
     */
    protected $signature = 'app:find-duplicate-images {--threshold=5 : Hamming distance threshold for duplicates}';

    /**
     * The console command description.
     *
     * @var string
     */
    protected $description = 'Find duplicate images using Hamming distance comparison of WebP hashes';

    /**
     * Service used to compute the Hamming distance between two stored image hashes.
     */
    private ImageHashService $imageHashService;

    /**
     * @param ImageHashService $imageHashService Provides calculateHammingDistance() for hash comparison.
     */
    public function __construct(ImageHashService $imageHashService)
    {
        parent::__construct();

        $this->imageHashService = $imageHashService;
    }

    /**
     * Execute the console command.
     *
     * Loads every MemeMedia row that has both an image hash and a WebP URL,
     * groups rows whose hashes are within the --threshold Hamming distance of
     * each other, and prints the groups followed by summary totals.
     *
     * @return int self::SUCCESS on a completed scan (including "nothing found"),
     *             self::FAILURE when --threshold is not a non-negative integer.
     */
    public function handle(): int
    {
        // Reject values such as "abc" or "-3" instead of silently casting them to an int.
        $threshold = filter_var(
            $this->option('threshold'),
            FILTER_VALIDATE_INT,
            ['options' => ['min_range' => 0]]
        );

        if ($threshold === false) {
            $this->error('The --threshold option must be a non-negative integer.');

            return self::FAILURE;
        }

        $records = MemeMedia::whereNotNull('image_hash')
            ->whereNotNull('webp_url')
            ->get(['id', 'name', 'image_hash', 'webp_url']);

        if ($records->isEmpty()) {
            $this->info('No records with image hashes found. Run app:generate-image-hashes first.');

            return self::SUCCESS;
        }

        $this->info("Checking {$records->count()} records for duplicates with threshold: {$threshold}");

        $duplicates = $this->findDuplicateGroups($records, $threshold);

        if ($duplicates === []) {
            $this->info('No duplicates found.');

            return self::SUCCESS;
        }

        $this->reportDuplicateGroups($duplicates);

        return self::SUCCESS;
    }

    /**
     * Greedily group records whose hashes are within the threshold of a group's first record.
     *
     * Records are visited in the given order. The first unclaimed record of a
     * group becomes its "original"; every other unclaimed record within
     * $threshold of that original is attached to it and is never considered
     * again, so each record appears in at most one group. Similarity is only
     * measured against the original, not between the attached duplicates.
     *
     * @param iterable $records   Records exposing id, name, image_hash and webp_url;
     *                            must be re-iterable because it is traversed in a nested loop.
     * @param int      $threshold Maximum distance (inclusive) for two hashes to count as duplicates.
     *
     * @return array<int, array{original: array, duplicates: array}>
     */
    private function findDuplicateGroups(iterable $records, int $threshold): array
    {
        $groups = [];

        // Set of already-grouped record ids, keyed by id so membership is an O(1) isset().
        $processed = [];

        foreach ($records as $record) {
            if (isset($processed[$record->id])) {
                continue;
            }

            $similarRecords = [];

            foreach ($records as $compareRecord) {
                if ($record->id === $compareRecord->id || isset($processed[$compareRecord->id])) {
                    continue;
                }

                $distance = $this->imageHashService->calculateHammingDistance(
                    $record->image_hash,
                    $compareRecord->image_hash
                );

                if ($distance <= $threshold) {
                    $similarRecords[] = [
                        'id' => $compareRecord->id,
                        'name' => $compareRecord->name,
                        'distance' => $distance,
                        'url' => $compareRecord->webp_url,
                    ];

                    $processed[$compareRecord->id] = true;
                }
            }

            // A record only becomes a group's original once at least one match was found.
            if ($similarRecords !== []) {
                $groups[] = [
                    'original' => [
                        'id' => $record->id,
                        'name' => $record->name,
                        'url' => $record->webp_url,
                    ],
                    'duplicates' => $similarRecords,
                ];

                $processed[$record->id] = true;
            }
        }

        return $groups;
    }

    /**
     * Print every duplicate group and the summary totals to the console.
     *
     * @param array<int, array{original: array, duplicates: array}> $duplicates Non-empty list of groups
     *                                                                          as built by findDuplicateGroups().
     */
    private function reportDuplicateGroups(array $duplicates): void
    {
        $groupCount = count($duplicates);

        $this->info('Found '.$groupCount.' duplicate groups:');

        foreach ($duplicates as $group) {
            $this->newLine();
            $this->line("Original: [{$group['original']['id']}] {$group['original']['name']}");
            $this->line("URL: {$group['original']['url']}");

            foreach ($group['duplicates'] as $duplicate) {
                $this->line(" → [{$duplicate['id']}] {$duplicate['name']} (distance: {$duplicate['distance']})");
                $this->line(" URL: {$duplicate['url']}");
            }
        }

        $this->newLine();
        $this->info('Total duplicate groups: '.$groupCount);

        // Counts only the attached duplicates; each group's original is not included.
        $totalDuplicates = array_sum(array_map(
            static fn (array $group): int => count($group['duplicates']),
            $duplicates
        ));

        $this->info("Total duplicate records: {$totalDuplicates}");
    }
}
|