diff --git a/.gitignore b/.gitignore index c99b547bc..b11375e36 100644 --- a/.gitignore +++ b/.gitignore @@ -85,3 +85,4 @@ download_log.json +.claude diff --git a/cellbase-app/app/cloud/docker/cellbase-builder/Dockerfile b/cellbase-app/app/cloud/docker/cellbase-builder/Dockerfile index bcb2de9cb..19fbd31f9 100644 --- a/cellbase-app/app/cloud/docker/cellbase-builder/Dockerfile +++ b/cellbase-app/app/cloud/docker/cellbase-builder/Dockerfile @@ -1,5 +1,5 @@ -ARG TAG -FROM opencb/cellbase-base:$TAG +ARG TAG=latest +FROM opencb/cellbase-base:${TAG} LABEL org.label-schema.vendor="OpenCB" \ org.label-schema.name="cellbase-builder" \ @@ -32,4 +32,4 @@ RUN cd /opt/ensembl && \ ## Give writting permissions to allow the script ensembl_canonical.pl to create sub-folder for cache purposes RUN chmod -R 777 /opt/cellbase/scripts/ensembl-scripts/ -ENV PERL5LIB=$PERL5LIB:/opt/ensembl/bioperl-live:/opt/ensembl/ensembl/modules:/opt/ensembl/ensembl-variation/modules:/opt/ensembl/ensembl-funcgen/modules:/opt/ensembl/ensembl-compara/modules:/opt/ensembl/lib/perl/5.18.2:/opt/cellbase/scripts/ensembl-scripts:/opt/ensembl/biomart-perl/lib +ENV PERL5LIB=/opt/ensembl/bioperl-live:/opt/ensembl/ensembl/modules:/opt/ensembl/ensembl-variation/modules:/opt/ensembl/ensembl-funcgen/modules:/opt/ensembl/ensembl-compara/modules:/opt/ensembl/lib/perl/5.18.2:/opt/cellbase/scripts/ensembl-scripts:/opt/ensembl/biomart-perl/lib diff --git a/cellbase-app/app/scripts/ensembl-scripts/protein_function_prediction_matrices.pl b/cellbase-app/app/scripts/ensembl-scripts/protein_function_prediction_matrices.pl index 3b7939fa9..ec8c93549 100755 --- a/cellbase-app/app/scripts/ensembl-scripts/protein_function_prediction_matrices.pl +++ b/cellbase-app/app/scripts/ensembl-scripts/protein_function_prediction_matrices.pl @@ -69,7 +69,7 @@ print "Generating the JSON file for the Sift version.\n"; $jsonVersion->{"id"} = "sift"; $jsonVersion->{"name"} = "Sift"; -open(FILE, ">".$outdir."/siftVersion.json") || die "error 
opening file\n"; +open(FILE, ">".$outdir."/siftVersion.json") || die "error opening file [$outdir/siftVersion.json]: $!\n"; print FILE to_json($jsonVersion) . "\n"; close(FILE); @@ -77,7 +77,7 @@ print "Generating the JSON file for the PolyPhen version\n"; $jsonVersion->{"id"} = "polyphen"; $jsonVersion->{"name"} = "PolyPhen"; -open(FILE, ">".$outdir."/polyphenVersion.json") || die "error opening file\n"; +open(FILE, ">".$outdir."/polyphenVersion.json") || die "error opening file [$outdir/polyphenVersion.json]: $!\n"; print FILE to_json($jsonVersion) . "\n"; close(FILE); @@ -158,7 +158,8 @@ #my @all_chroms = @{$slice_adaptor->fetch_all('chromosome')}; foreach my $chr(@chromosomes) { my @transcripts = @{$chr->get_all_Transcripts()}; - open(FILE, ">".$outdir."/prot_func_pred_chr_".$chr->seq_region_name.".json") || die "error opening file\n"; + my $filename = $outdir."/prot_func_pred_chr_".$chr->seq_region_name.".json"; + open(FILE, ">".$filename) || die "error opening file [$filename]: $!\n"; print @transcripts." 
transcripts fetched!\n"; foreach my $trans(@transcripts) { if($trans->biotype eq 'protein_coding') { diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java index d36af4111..53cf9e121 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java @@ -22,6 +22,7 @@ import org.opencb.cellbase.app.cli.admin.AdminCliOptionsParser; import org.opencb.cellbase.core.config.SpeciesConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.models.DataSource; import org.opencb.cellbase.core.models.Release; import org.opencb.cellbase.core.result.CellBaseDataResult; import org.opencb.cellbase.core.utils.DatabaseNameUtils; @@ -354,22 +355,34 @@ private void loadConservation() throws IOException, CellBaseException { private void loadProteinFunctionalPrediction() throws NoSuchMethodException, InterruptedException, ExecutionException, InstantiationException, IllegalAccessException, InvocationTargetException, ClassNotFoundException, IOException, CellBaseException, LoaderException { + // Check if SIFT/POLYPHEN source has already been loaded + checkSourceAlreadyLoaded(SIFT_DATA); + checkSourceAlreadyLoaded(POLYPHEN_DATA); + loadData(input.resolve(PROTEIN_SUBSTITUTION_PREDICTION_DATA).resolve(PROTEIN_FUNCTIONAL_PREDICTION_DATA), PROTEIN_SUBSTITUTION_PREDICTION_DATA, "prot_func_pred_"); } private void loadRevel() throws CellBaseException { + // Check if REVEL source has already been loaded + checkSourceAlreadyLoaded(REVEL_DATA); + HashMap collectionMap = new HashMap<>(); collectionMap.put(PROTEIN_SUBSTITUTION_PREDICTION_DATA, REVEL_DATA + JSON_GZ_EXTENSION); - loadData(input.resolve(PROTEIN_SUBSTITUTION_PREDICTION_DATA).resolve(REVEL_DATA), 
collectionMap); + Path revelPath = input.resolve(PROTEIN_SUBSTITUTION_PREDICTION_DATA).resolve(REVEL_DATA); + loadData(revelPath, collectionMap); } private void loadAlphaMissense() throws CellBaseException { + // Check if AlphaMissense source has already been loaded + checkSourceAlreadyLoaded(ALPHAMISSENSE_DATA); + HashMap collectionMap = new HashMap<>(); collectionMap.put(PROTEIN_SUBSTITUTION_PREDICTION_DATA, ALPHAMISSENSE_DATA + JSON_GZ_EXTENSION); - loadData(input.resolve(PROTEIN_SUBSTITUTION_PREDICTION_DATA).resolve(ALPHAMISSENSE_DATA), collectionMap); + Path alphaMissensePath = input.resolve(PROTEIN_SUBSTITUTION_PREDICTION_DATA).resolve(ALPHAMISSENSE_DATA); + loadData(alphaMissensePath, collectionMap); } private void loadClinical() throws FileNotFoundException { @@ -677,4 +690,16 @@ private Release getDataReleaseForLoading(DataReleaseManager dataReleaseManager) } return lastDataRelease; } + + private void checkSourceAlreadyLoaded(String sourceId) throws CellBaseException { + Release release = getDataReleaseForLoading(dataReleaseManager); + if (release.getSources() != null) { + for (DataSource source : release.getSources()) { + if (sourceId.equalsIgnoreCase(source.getId())) { + throw new CellBaseException("Loading data '" + sourceId + "' with release " + dataRelease + + " failed: source '" + sourceId + "' already loaded previously"); + } + } + } + } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/ProteinMongoDBAdaptor.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/ProteinMongoDBAdaptor.java index 2d3efefa9..3afbbc0cd 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/ProteinMongoDBAdaptor.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/ProteinMongoDBAdaptor.java @@ -18,13 +18,16 @@ import com.mongodb.BasicDBList; import com.mongodb.client.model.Filters; +import com.mongodb.client.model.Projections; import org.apache.commons.collections4.CollectionUtils; +import 
org.apache.commons.collections4.MapUtils; import org.apache.commons.lang3.StringUtils; import org.bson.Document; import org.bson.conversions.Bson; import org.opencb.biodata.formats.protein.uniprot.v202003jaxb.Entry; import org.opencb.biodata.models.core.ProteinSubstitutionPrediction; import org.opencb.biodata.models.core.ProteinSubstitutionPredictionScore; +import org.opencb.biodata.models.variant.Variant; import org.opencb.biodata.models.variant.avro.ProteinFeature; import org.opencb.biodata.models.variant.avro.ProteinVariantAnnotation; import org.opencb.biodata.models.variant.avro.Score; @@ -36,6 +39,7 @@ import org.opencb.cellbase.core.result.CellBaseDataResult; import org.opencb.cellbase.lib.iterator.CellBaseIterator; import org.opencb.cellbase.lib.iterator.CellBaseMongoDBIterator; +import org.opencb.cellbase.lib.variant.VariantAnnotationUtils; import org.opencb.commons.datastore.core.DataResult; import org.opencb.commons.datastore.core.Query; import org.opencb.commons.datastore.core.QueryOptions; @@ -44,6 +48,9 @@ import java.util.*; +import static org.opencb.cellbase.lib.EtlCommons.ALPHAMISSENSE_DATA; +import static org.opencb.cellbase.lib.EtlCommons.REVEL_DATA; + /** * Created by imedina on 01/12/15. 
*/ @@ -51,7 +58,7 @@ public class ProteinMongoDBAdaptor extends CellBaseDBAdaptor implements CellBase private Map proteinSubstitutionMongoDBCollectionByRelease; - private static final int NUM_PROTEIN_SUBSTITUTION_SCORE_METHODS = 2; + private static final int NUM_PROTEIN_SUBSTITUTION_SCORE_METHODS = 4; private static Map aaShortNameMap; @@ -94,24 +101,110 @@ private void init() { proteinSubstitutionMongoDBCollectionByRelease = buildCollectionByReleaseMap("protein_substitution_prediction"); } - public CellBaseDataResult getSubstitutionScores(TranscriptQuery query, Integer aaPosition, String aa) throws CellBaseException { - long dbTimeStart = System.currentTimeMillis(); - Map scoreSet = new HashMap<>(); - - // transcriptId, aaPosition, aaAlternate are needed for this collection - if (query.getTranscriptsId() != null && query.getTranscriptsId().get(0) != null && aaPosition != null - && StringUtils.isNotEmpty(aa)) { + public CellBaseDataResult getSubstitutionScores(TranscriptQuery query, String chromosome, Integer position, Integer aaPosition, + String aa) throws CellBaseException { + CellBaseDataResult result = null; + // Ensembl transcript id is needed for this collection + if (query.getTranscriptsId() != null && query.getTranscriptsId().get(0) != null) { + String transcriptId = query.getTranscriptsId().get(0).split("\\.")[0]; + // Filter for SIFT/POLYPHEN documents: they have the 'size' field, REVEL/ALPHAMISSENSE don't + List andBsonList = new ArrayList<>(); + andBsonList.add(Filters.eq("transcriptId", transcriptId)); + andBsonList.add(Filters.exists("size", true)); + Bson transcript = Filters.and(andBsonList); MongoDBCollection mongoDBCollection = getCollectionByRelease(proteinSubstitutionMongoDBCollectionByRelease, query.getDataRelease()); - List andBsonList = new ArrayList<>(); + String aaShortName = null; + // If position and aa change are provided we create a 'projection' to return only the required data from the database + if (aaPosition != null) { + String 
projectionString = "aaPositions." + aaPosition; + + // If aa change is provided we only return that information + if (StringUtils.isNotEmpty(aa)) { + aaShortName = aaShortNameMap.get(aa.toUpperCase()); + projectionString += "." + aaShortName; + } + + // Projection is used to minimize the returned data + Bson positionProjection = Projections.include(projectionString); + result = new CellBaseDataResult<>(mongoDBCollection.find(transcript, positionProjection, query.toQueryOptions())); + } else { + // Return the whole transcript data + result = new CellBaseDataResult<>(mongoDBCollection.find(transcript, query.toQueryOptions())); + } + + if (result != null && !result.getResults().isEmpty()) { + Document document = (Document) result.getResults().get(0); + Document aaPositionsDocument = (Document) document.get("aaPositions"); + + // Position or aa change were not provided, returning whole transcript data + if (aaPosition == null || aaPosition == -1 || aaShortName == null) { + // Return only the inner Document, not the whole document projected + result.setResults(Collections.singletonList(aaPositionsDocument)); + // Position and aa were provided, return only corresponding Score objects + } else { + List scoreList = new ArrayList<>(NUM_PROTEIN_SUBSTITUTION_SCORE_METHODS); + if (result.getNumResults() == 1 && aaPositionsDocument != null) { + Document positionDocument = (Document) aaPositionsDocument.get(Integer.toString(aaPosition)); + if (positionDocument != null) { + Document aaDocument = (Document) positionDocument.get(aaShortName); + if (aaDocument != null) { + if (aaDocument.get("ss") != null) { + scoreList.add(new Score(Double.parseDouble("" + aaDocument.get("ss")), + "sift", VariantAnnotationUtils.SIFT_DESCRIPTIONS.get(aaDocument.get("se")))); + } + if (aaDocument.get("ps") != null) { + scoreList.add(new Score(Double.parseDouble("" + aaDocument.get("ps")), + "polyphen", VariantAnnotationUtils.POLYPHEN_DESCRIPTIONS.get(aaDocument.get("pe")))); + } + } + } + } + + // 
Query for REVEL and ALPHAMISSENSE scores (different data model) + scoreList.addAll(getRevelAndAlphaMissenseScores(mongoDBCollection, chromosome, position, transcriptId, aaPosition, aa)); + + result.setResults(scoreList); + } + } + } + + // Return null if no transcript id is provided + return result; + } + + private List getRevelAndAlphaMissenseScores(MongoDBCollection mongoDBCollection, String chromosome, Integer position, + String transcriptId, Integer aaPosition, String aa) { + Map scoreSet = new HashMap<>(); + + // aaPosition/position, aaAlternate are needed for this collection + if (transcriptId != null && StringUtils.isNotEmpty(aa)) { // Sanity check, protein substitution predictions do not contain the transcript ID version - String transcriptId = query.getTranscriptsId().get(0).split("\\.")[0]; - andBsonList.add(Filters.eq("transcriptId", transcriptId)); - andBsonList.add(Filters.eq("aaPosition", aaPosition)); + transcriptId = transcriptId.split("\\.")[0]; + + List andBsonList = new ArrayList<>(); + // Query without transcriptId filter (will filter in Java for better performance) + andBsonList.add(Filters.eq("chromosome", chromosome)); + andBsonList.add(Filters.in("source", REVEL_DATA, ALPHAMISSENSE_DATA)); + + // Efficient single OR query: aaPosition for ALPHAMISSENSE, position for REVEL + List orBsonList = new ArrayList<>(); + if (aaPosition != null) { + orBsonList.add(Filters.eq("aaPosition", aaPosition)); // ALPHAMISSENSE + } + if (position != null) { + orBsonList.add(Filters.eq("position", position)); // REVEL + } + if (!orBsonList.isEmpty()) { + andBsonList.add(Filters.or(orBsonList)); + } + String aaAlternate = aaShortNameMap.get(aa.toUpperCase()); - andBsonList.add(Filters.eq("scores.aaAlternate", aaAlternate)); + if (aaAlternate != null) { + andBsonList.add(Filters.eq("scores.aaAlternate", aaAlternate)); + } Bson bson = Filters.and(andBsonList); DataResult predictions = mongoDBCollection.find(bson, null, ProteinSubstitutionPrediction.class, @@ -119,6 
+212,11 @@ public CellBaseDataResult getSubstitutionScores(TranscriptQuery query, In if (predictions != null && CollectionUtils.isNotEmpty(predictions.getResults())) { for (ProteinSubstitutionPrediction prediction : predictions.getResults()) { + // Filter by transcriptId in Java (handles both single and semicolon-separated values) + if (!isTranscriptIdMatch(prediction.getTranscriptId(), transcriptId)) { + continue; + } + for (ProteinSubstitutionPredictionScore predictionScore : prediction.getScores()) { if (StringUtils.isNotEmpty(predictionScore.getAaAlternate()) && StringUtils.isNotEmpty(aaAlternate) && predictionScore.getAaAlternate().equals(aaAlternate)) { @@ -133,93 +231,62 @@ public CellBaseDataResult getSubstitutionScores(TranscriptQuery query, In } } - int dbTime = Long.valueOf(System.currentTimeMillis() - dbTimeStart).intValue(); - return new CellBaseDataResult<>("getSubstitutionScores", dbTime, new ArrayList<>(), scoreSet.size(), - new ArrayList<>(scoreSet.values()), scoreSet.size()); + if (MapUtils.isEmpty(scoreSet)) { + return new ArrayList<>(); + } else { + return new ArrayList<>(scoreSet.values()); + } } -// public CellBaseDataResult getSubstitutionScores(Query query, QueryOptions options) { -// CellBaseDataResult result = null; -// -// // Ensembl transcript id is needed for this collection -// if (query.getString("transcript") != null) { -// Bson transcript = Filters.eq("transcriptId", query.getString("transcript")); -// -// int position = -1; -// String aaShortName = null; -// // If position and aa change are provided we create a 'projection' to return only the required data from the database -// if (query.get("position") != null && !query.getString("position").isEmpty() && query.getInt("position", 0) != 0) { -// position = query.getInt("position"); -// String projectionString = "aaPositions." 
+ position; -// -// // If aa change is provided we only return that information -// if (query.getString("aa") != null && !query.getString("aa").isEmpty()) { -// aaShortName = aaShortNameMap.get(query.getString("aa").toUpperCase()); -// projectionString += "." + aaShortName; -// } -// -// // Projection is used to minimize the returned data -// Bson positionProjection = Projections.include(projectionString); -// result = new CellBaseDataResult<>(proteinSubstitutionMongoDBCollection.find(transcript, positionProjection, options)); -// } else { -// // Return the whole transcript data -// result = new CellBaseDataResult<>(proteinSubstitutionMongoDBCollection.find(transcript, options)); -// } -// -// if (result != null && !result.getResults().isEmpty()) { -// Document document = (Document) result.getResults().get(0); -// Document aaPositionsDocument = (Document) document.get("aaPositions"); -// -// // Position or aa change were not provided, returning whole transcript data -// if (position == -1 || aaShortName == null) { -// // Return only the inner Document, not the whole document projected -// result.setResults(Collections.singletonList(aaPositionsDocument)); -// // Position and aa were provided, return only corresponding Score objects -// } else { -// List scoreList = null; -// if (result.getNumResults() == 1 && aaPositionsDocument != null) { -// scoreList = new ArrayList<>(NUM_PROTEIN_SUBSTITUTION_SCORE_METHODS); -// Document positionDocument = (Document) aaPositionsDocument.get(Integer.toString(position)); -// Document aaDocument = (Document) positionDocument.get(aaShortName); -// if (aaDocument.get("ss") != null) { -// scoreList.add(new Score(Double.parseDouble("" + aaDocument.get("ss")), -// "sift", VariantAnnotationUtils.SIFT_DESCRIPTIONS.get(aaDocument.get("se")))); -// } -// if (aaDocument.get("ps") != null) { -// scoreList.add(new Score(Double.parseDouble("" + aaDocument.get("ps")), -// "polyphen", 
VariantAnnotationUtils.POLYPHEN_DESCRIPTIONS.get(aaDocument.get("pe")))); -// } -// } -// result.setResults(scoreList); -// } -// } -// } -// // Return null if no transcript id is provided -// return result; -// -// } + /** + * Check if transcriptId matches the stored value. + * Handles both single transcriptId and semicolon-separated list (REVEL format) + * @param storedTranscriptId the value from the database (can be "ENST..." or "ENST...;ENST...;ENST...") + * @param queryTranscriptId the transcript ID we're looking for + * @return true if queryTranscriptId is found in storedTranscriptId + */ + private boolean isTranscriptIdMatch(String storedTranscriptId, String queryTranscriptId) { + if (StringUtils.isEmpty(storedTranscriptId) || StringUtils.isEmpty(queryTranscriptId)) { + return false; + } + + // For single value (ALPHAMISSENSE) or exact match + if (storedTranscriptId.equals(queryTranscriptId)) { + return true; + } - public CellBaseDataResult getVariantAnnotation(String ensemblTranscriptId, int position, String aaReference, - String aaAlternate, QueryOptions options, int dataRelease) + // For semicolon-separated list (REVEL) + // Split and check if queryTranscriptId is in the list + String[] transcriptIds = storedTranscriptId.split(";"); + for (String id : transcriptIds) { + if (id.equals(queryTranscriptId)) { + return true; + } + } + + return false; + } + + public CellBaseDataResult getVariantAnnotation(Variant variant, String ensemblTranscriptId, int aaPosition, + String aaReference, String aaAlternate, QueryOptions options, + int dataRelease) throws CellBaseException { CellBaseDataResult cellBaseDataResult = new CellBaseDataResult<>(); - cellBaseDataResult.setId(ensemblTranscriptId + "/" + position + "/" + aaAlternate); + cellBaseDataResult.setId(ensemblTranscriptId + "/" + aaPosition + "/" + aaAlternate); long dbTimeStart = System.currentTimeMillis(); ProteinVariantAnnotation proteinVariantAnnotation = new ProteinVariantAnnotation(); - 
proteinVariantAnnotation.setPosition(position); + proteinVariantAnnotation.setPosition(aaPosition); proteinVariantAnnotation.setReference(aaReference); proteinVariantAnnotation.setAlternate(aaAlternate); -// Query query = new Query("transcript", ensemblTranscriptId).append("position", position).append("aa", aaAlternate); // Stop_gain/lost variants do not have SIFT/POLYPHEN scores -// System.out.println("aaReference = " + aaReference); -// System.out.println("aaAlternate = " + aaAlternate); -// if (!aaAlternate.equals("STOP") && !aaReference.equals("STOP")) { + if (!aaAlternate.equals("STOP") && !aaReference.equals("STOP")) { TranscriptQuery query = new TranscriptQuery(); query.setTranscriptsId(Collections.singletonList(ensemblTranscriptId)); query.setDataRelease(dataRelease); - proteinVariantAnnotation.setSubstitutionScores(getSubstitutionScores(query, position, aaAlternate).getResults()); -// } + proteinVariantAnnotation.setSubstitutionScores(getSubstitutionScores(query, variant.getChromosome(), variant.getStart(), + aaPosition, aaAlternate).getResults()); + } CellBaseDataResult proteinVariantData; String shortAlternativeAa = aaShortNameMap.get(aaAlternate); @@ -236,12 +303,12 @@ public CellBaseDataResult getVariantAnnotation(String pipeline.add(new Document("$unwind", "$feature")); BasicDBList andDBList2 = new BasicDBList(); - andDBList2.add(new Document("feature.location.position.position", position)); + andDBList2.add(new Document("feature.location.position.position", aaPosition)); andDBList2.add(new Document("feature.variation", shortAlternativeAa)); Document firstOr = new Document("$and", andDBList2); BasicDBList andDBList3 = new BasicDBList(); - andDBList3.add(new Document("feature.location.end.position", new Document("$gte", position))); - andDBList3.add(new Document("feature.location.begin.position", new Document("$lte", position))); + andDBList3.add(new Document("feature.location.end.position", new Document("$gte", aaPosition))); + andDBList3.add(new 
Document("feature.location.begin.position", new Document("$lte", aaPosition))); Document secondOr = new Document(); secondOr.put("$and", andDBList3); BasicDBList orList = new BasicDBList(); @@ -256,8 +323,8 @@ public CellBaseDataResult getVariantAnnotation(String pipeline.add(new Document("$group", groupFields)); MongoDBCollection mongoDBCollection = getCollectionByRelease(mongoDBCollectionByRelease, dataRelease); - proteinVariantData = executeAggregation2(ensemblTranscriptId + "_" + String.valueOf(position) + "_" - + aaAlternate, pipeline, new QueryOptions(), mongoDBCollection); + proteinVariantData = executeAggregation2(ensemblTranscriptId + "_" + aaPosition + "_" + aaAlternate, pipeline, + new QueryOptions(), mongoDBCollection); if (proteinVariantData.getNumResults() > 0) { proteinVariantAnnotation = processProteinVariantData(proteinVariantAnnotation, shortAlternativeAa, (Document) proteinVariantData.getResults().get(0)); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/loader/MongoDBCellBaseLoader.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/loader/MongoDBCellBaseLoader.java index 2c0bb84da..b08806941 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/loader/MongoDBCellBaseLoader.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/loader/MongoDBCellBaseLoader.java @@ -51,6 +51,8 @@ import java.util.concurrent.BlockingQueue; import java.util.stream.Collectors; +import static org.opencb.cellbase.lib.EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA; + /** * Created by parce on 18/02/15. */ @@ -137,13 +139,19 @@ private String getCollectionName() throws LoaderException { throw new LoaderException("Invalid data release " + dataRelease + " for database " + database + ". 
Available releases" + " are: " + StringUtils.join(releases, ",")); } - for (Release dr : result.getResults()) { - if (dr.getRelease() == dataRelease) { - if (dr.getCollections().containsKey(data) && dr.getCollections().get(data).equals(collectionName)) { - throw new LoaderException("Loading new data " + data + " with release " + dataRelease - + " (already populated previously)"); + + // Sanity check: do not populate collections that are already populated, with one exception: + // Protein substitution prediction data (i.e., polyphen, sift, revel and alphaMissense) is checked later, since these + // sources are loaded in the same collection but independently + if (!data.equalsIgnoreCase(PROTEIN_SUBSTITUTION_PREDICTION_DATA)) { + for (Release dr : result.getResults()) { + if (dr.getRelease() == dataRelease) { + if (dr.getCollections().containsKey(data) && dr.getCollections().get(data).equals(collectionName)) { + throw new LoaderException("Loading new data '" + data + "' with release " + dataRelease + + " (already populated previously)"); + } + break; } - break; } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/ProteinManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/ProteinManager.java index e1a068147..1fce32a06 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/ProteinManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/ProteinManager.java @@ -76,7 +76,7 @@ public CellBaseDataResult getSubstitutionScores(TranscriptQuery query, Integer p if (queryResult.getNumResults() > 0) { String transcriptId = queryResult.getResults().get(0).getId(); query.setTranscriptsId(Collections.singletonList(transcriptId)); - CellBaseDataResult scoresCellBaseDataResult = proteinDBAdaptor.getSubstitutionScores(query, position, aa); + CellBaseDataResult scoresCellBaseDataResult = proteinDBAdaptor.getSubstitutionScores(query, null, null, position, aa); scoresCellBaseDataResult.setId(transcriptId); return 
scoresCellBaseDataResult; } else { @@ -101,8 +101,8 @@ public CellBaseDataResult getSequence(ProteinQuery query) throws CellBas public CellBaseDataResult getVariantAnnotation(Variant variant, String ensemblTranscriptId, int aaPosition, String aaReference, String aaAlternate, QueryOptions options, int dataRelease) throws CellBaseException { - CellBaseDataResult proteinVariantAnnotation = proteinDBAdaptor.getVariantAnnotation(ensemblTranscriptId, - aaPosition, aaReference, aaAlternate, options, dataRelease); + CellBaseDataResult proteinVariantAnnotation = proteinDBAdaptor.getVariantAnnotation(variant, + ensemblTranscriptId, aaPosition, aaReference, aaAlternate, options, dataRelease); return proteinVariantAnnotation; } diff --git a/cellbase-lib/src/main/resources/mongodb-indexes.json b/cellbase-lib/src/main/resources/mongodb-indexes.json index f624111f7..f3e3411b3 100644 --- a/cellbase-lib/src/main/resources/mongodb-indexes.json +++ b/cellbase-lib/src/main/resources/mongodb-indexes.json @@ -185,6 +185,8 @@ {"collection": "protein_substitution_prediction", "fields": {"uniprotId": 1}, "options": {"background": true}} {"collection": "protein_substitution_prediction", "fields": {"transcriptId": 1}, "options": {"background": true}} {"collection": "protein_substitution_prediction", "fields": {"aaPosition": 1}, "options": {"background": true}} +{"collection": "protein_substitution_prediction", "fields": {"chromosome": 1, "source": 1, "aaPosition": 1, "position": 1}, "options": {"background": true}} +{"collection": "protein_substitution_prediction", "fields": {"transcriptId": 1, "size": 1}, "options": {"background": true}} {"collection": "snp", "fields": {"id": 1}, "options": {"background": true}} {"collection": "snp", "fields": {"chromosome": 1, "position": 1, "reference": 1}, "options": {"background": true}} diff --git a/cellbase-lib/src/test/resources/index/mongodb-indexes.json b/cellbase-lib/src/test/resources/index/mongodb-indexes.json index f624111f7..365e1aacf 100644 
--- a/cellbase-lib/src/test/resources/index/mongodb-indexes.json +++ b/cellbase-lib/src/test/resources/index/mongodb-indexes.json @@ -185,6 +185,8 @@ {"collection": "protein_substitution_prediction", "fields": {"uniprotId": 1}, "options": {"background": true}} {"collection": "protein_substitution_prediction", "fields": {"transcriptId": 1}, "options": {"background": true}} {"collection": "protein_substitution_prediction", "fields": {"aaPosition": 1}, "options": {"background": true}} +{"collection": "protein_substitution_prediction", "fields": {"transcriptId": 1, "size": 1}, "options": {"background": true}} +{"collection": "protein_substitution_prediction", "fields": {"chromosome": 1, "source": 1, "aaPosition": 1, "position": 1}, "options": {"background": true}} {"collection": "snp", "fields": {"id": 1}, "options": {"background": true}} {"collection": "snp", "fields": {"chromosome": 1, "position": 1, "reference": 1}, "options": {"background": true}}