Skip to content

Commit

Permalink
fix: various updates
Browse files Browse the repository at this point in the history
  • Loading branch information
CS76 committed Aug 20, 2024
1 parent ee160e9 commit 7f31573
Show file tree
Hide file tree
Showing 8 changed files with 443 additions and 227 deletions.
210 changes: 165 additions & 45 deletions app/Console/Commands/AssignIdentifiers.php
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
use App\Models\Ticker;
use DB;
use Illuminate\Console\Command;
use Illuminate\Support\Collection;

class AssignIdentifiers extends Command
{
Expand All @@ -28,50 +29,175 @@ class AssignIdentifiers extends Command
*/
public function handle()
{
$batchSize = 1000;
$batchSize = 10000;
$currentIndex = $this->fetchLastIndex() + 1;
$data = [];
$this->info('Mapping parents');
Molecule::select('identifier', 'id', 'has_variants')->where([
['has_stereo', '=', false],
['identifier', '=', null],
])->chunk($batchSize, function ($molecules) use (&$currentIndex) {
$data = [];
$header = ['id', 'identifier'];
foreach ($molecules as $molecule) {
if (! $molecule->identifier) {
$data[] = array_combine($header, [$molecule->id, $this->generateIdentifier($currentIndex)]);
$currentIndex++;
}
}
$this->insertBatch($data);
});
$this->info('Mapping parents: Done');

$this->info('Mapping variants');
Molecule::select('identifier', 'id', 'has_variants')->where(
[
['is_parent', '=', true],
['identifier', '!=', null],
])->chunk($batchSize, function ($molecules) {
$data = [];
$header = ['id', 'identifier'];
foreach ($molecules as $molecule) {
$i = 1;
$variants = $molecule->variants;
foreach ($variants as $variant) {
$data[] = array_combine($header, [$variant->id, $molecule->identifier.'.'.$i]);
$i++;
}

// // Step: 1
// $parents = DB::table('molecules')
// ->select('id', 'identifier')
// ->where('has_stereo', false)
// ->whereNull('identifier')
// ->get();

// $parents->chunk($batchSize)->each(function ($moleculesChunk) use (&$currentIndex) {
// $data = [];
// $header = ['id', 'identifier'];
// foreach ($moleculesChunk as $molecule) {
// echo($molecule->id . ' - ' . $currentIndex);
// echo("\r\n");
// if (! $molecule->identifier) {
// $data[] = array_combine($header, [$molecule->id, $this->generateIdentifier($currentIndex, 'parent')]);
// $currentIndex++;
// }
// }
// $this->insertBatch($data);
// });

// $this->info('Mapping parents: Done');

// // Step: 2
// $mappings = DB::table('molecules')
// ->select('parent_id', 'id')
// ->whereNotNull('parent_id')
// ->where('has_stereo', true)
// ->get()
// ->groupBy('parent_id')
// ->map(function ($items) {
// return $items->pluck('id')->sort()->values()->toArray();
// });

// $identifier_mappings = DB::table('molecules')
// ->where('has_stereo', false)
// ->where('is_parent', true)
// ->pluck('identifier', 'id')
// ->toArray();

// $identifier_mappings = array_map(function($identifier) {
// return str_replace('.0', '', $identifier);
// }, $identifier_mappings);

// $jsonData = json_encode($mappings, JSON_PRETTY_PRINT);
// $filePath = storage_path('parent_id_mappings.json');
// file_put_contents($filePath, $jsonData);

// $jsonData = json_encode($identifier_mappings, JSON_PRETTY_PRINT);
// $filePath = storage_path('identifier_mappings.json');
// file_put_contents($filePath, $jsonData);

// // Step: 3
// $mappings = json_decode(file_get_contents(storage_path('parent_id_mappings.json')), true);
// $identifier_mappings = json_decode(file_get_contents(storage_path('identifier_mappings.json')), true);

// $bulkUpdateData = [];

// foreach ($mappings as $parentId => $rowIds) {
// if (isset($identifier_mappings[$parentId])) {
// $baseIdentifier = $identifier_mappings[$parentId];

// foreach ($rowIds as $index => $rowId) {
// $newIdentifier = $baseIdentifier . '.' . ($index + 1);

// $bulkUpdateData[] = [
// 'row_id' => $rowId,
// 'identifier' => $newIdentifier
// ];
// }
// }
// }

// $batchSize = 10000;
// $i = 0;
// Collection::make($bulkUpdateData)->chunk($batchSize)->each(function ($chunk) use (&$i) {
// echo($i);
// echo("\r\n");
// DB::transaction(function () use ($chunk) {
// foreach ($chunk as $data) {
// DB::table('molecules')
// ->where('id', $data['row_id'])
// ->update(['identifier' => $data['identifier']]);
// }
// });
// $i++;
// });

// Mapping mismatched parent_ids
// $nullIdentifiers = DB::table('molecules')
// ->whereNull('identifier')
// ->get();

// $mapped_data = array_map('str_getcsv', file(storage_path('Mapped_IDs.csv')));
// Initialize an associative array to store the key-value pairs
// $associativeArray = [];

// // Iterate through the array and map keys to values
// foreach ($mapped_data as $row) {
// if (isset($row[0], $row[1])) { // Check if both key and value exist
// $associativeArray[$row[0]] = $row[1];
// }
// }

// $nullIdentifiers->chunk(100)->each(function ($chunk) use ($associativeArray) {
// foreach ($chunk as $molecule) {
// if (isset($associativeArray[$molecule->parent_id])) {
// $parentId = $associativeArray[$molecule->parent_id];
// echo($molecule->parent_id . ' - '. $parentId);
// echo("\r\n");
// // Update the row with the parent_id
// DB::table('molecules')
// ->where('id', $molecule->id)
// ->update(['parent_id' => $parentId]);
// }
// }
// });
// // $identifier_mappings = json_decode(file_get_contents(storage_path('identifier_mappings.json')), true);

// $mappings = json_decode(file_get_contents(storage_path('parent_id_mappings.json')), true);
// $bulkUpdateData = [];
// foreach ($mappings as $parentId => $rowIds) {
// echo($parentId);
// echo("\r\n");
// $bulkUpdateData[] = [
// 'row_id' => $parentId
// ];
// }

// Collection::make($bulkUpdateData)->chunk($batchSize)->each(function ($chunk) use (&$i) {
// echo($i);
// echo("\r\n");
// DB::transaction(function () use ($chunk) {
// foreach ($chunk as $data) {
// DB::table('molecules')
// ->where('id', $data['row_id'])
// ->update(['has_variants' => true]);
// }
// });
// $i++;
// });

$mapped_data = array_map('str_getcsv', file(storage_path('collection_molecule_no_duplicates.csv')));

Collection::make($mapped_data)->chunk($batchSize)->each(function ($chunk) use (&$i) {
echo $i;
echo "\r\n";
DB::transaction(function () use ($chunk) {
foreach ($chunk as $data) {
DB::table('molecules')
->where('id', $data)
->update(['is_placeholder' => false]);
}
$this->insertBatch($data);
});
$this->info('Mapping variants: Done');
$i++;
});
}

public function generateIdentifier($index)
public function generateIdentifier($index, $type)
{
return 'CNP'.str_pad($index, 7, '0', STR_PAD_LEFT);
if ($type == 'parent') {
return 'CNP'.str_pad($index, 7, '0', STR_PAD_LEFT).'.0';
} else {
return 'CNP'.str_pad($index, 7, '0', STR_PAD_LEFT);
}
}

public function fetchLastIndex()
Expand All @@ -90,14 +216,8 @@ private function insertBatch(array $data)
{
DB::transaction(function () use ($data) {
foreach ($data as $row) {
Molecule::updateorCreate(
[
'id' => $row['id'],
],
[
'identifier' => $row['identifier'],
]
);
Molecule::where('id', $row['id'])
->update(['identifier' => $row['identifier']]);
}
});
}
Expand Down
36 changes: 36 additions & 0 deletions app/Console/Commands/ExtractCAS.php
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,42 @@ public function handle()
}
$this->insertBatch($data);
});

Molecule::whereNotNull('cas')->select('id', 'cas')->chunk(10000, function ($mols) {
$data = [];
$pattern = "/\b[1-9][0-9]{1,5}-\d{2}-\d\b/";
foreach ($mols as $mol) {
$inputArray = $mol->cas;

// Filter array to remove elements that start with "InChI"
$filteredArray = array_filter($inputArray, function ($element) {
return ! str_starts_with($element, 'InChI');
});

// Modify elements that start with "CAS-"
$modifiedArray = array_map(function ($element) {
if (str_starts_with($element, 'CAS-')) {
return substr($element, 4); // Remove "CAS-" prefix
}

return $element;
}, $filteredArray);

// Remove duplicates
$uniqueArray = array_unique($modifiedArray);

// Reset array keys
$uniqueArray = array_values($uniqueArray);

array_push($data, [
'id' => $mol->id,
'cas' => $uniqueArray,
]);

$this->info("Mapped and updated: $mol->id");
}
$this->insertBatch($data);
});
}

/**
Expand Down
Loading

0 comments on commit 7f31573

Please sign in to comment.