libpappsomspp
Library for mass spectrometry
Loading...
Searching...
No Matches
semiglobalalignment.cpp
Go to the documentation of this file.
1/**
2 * \file pappsomspp/processing/specpeptidoms/semiglobalalignment.cpp
3 * \date 24/03/2025
4 * \author Aurélien Berthier
5 * \brief protein to spectrum alignment
6 *
7 * C++ implementation of the SpecPeptidOMS algorithm described in :
8 * (1) Benoist, É.; Jean, G.; Rogniaux, H.; Fertin, G.; Tessier, D. SpecPeptidOMS Directly and
9 * Rapidly Aligns Mass Spectra on Whole Proteomes and Identifies Peptides That Are Not Necessarily
10 * Tryptic: Implications for Peptidomics. J. Proteome Res. 2025.
11 * https://doi.org/10.1021/acs.jproteome.4c00870.
12 */
13
14/*
15 * Copyright (c) 2025 Aurélien Berthier
16 * <aurelien.berthier@ls2n.fr>
17 *
18 *
19 * This program is free software: you can redistribute it and/or modify
20 * it under the terms of the GNU General Public License as published by
21 * the Free Software Foundation, either version 3 of the License, or
22 * (at your option) any later version.
23 *
24 * This program is distributed in the hope that it will be useful,
25 * but WITHOUT ANY WARRANTY; without even the implied warranty of
26 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 * GNU General Public License for more details.
28 *
29 * You should have received a copy of the GNU General Public License
30 * along with this program. If not, see <http://www.gnu.org/licenses/>.
31 */
32
33#include <QString>
34#include <QStringRef>
35#include <algorithm>
36#include "semiglobalalignment.h"
39#include "correctiontree.h"
42
43void
45{
46 peaks.clear();
47 m_peptideModel.reset();
48 score = 0;
49 begin_shift = 0.0;
50 end_shift = 0.0;
51 shifts.clear();
52 SPC = 0;
53 beginning = 0;
54 end = 0;
55}
56
57
58QString
59pappso::specpeptidoms::Alignment::getPeptideString(const QString &protein_sequence) const
60{
61 return protein_sequence.mid(beginning, end - beginning);
62}
63
64double
66{
67 double sum_of_elems = std::accumulate(shifts.begin(), shifts.end(), 0);
68 return begin_shift + sum_of_elems + end_shift;
69}
70
71
72std::size_t
77
78
80 const ScoreValues &score_values,
81 const pappso::PrecisionPtr precision_ptr,
82 const pappso::AaCode &aaCode)
83 : m_scorevalues(score_values), m_aaCode(aaCode)
84{
85 m_precision_ptr = precision_ptr;
86
87 KeyCell key_cell_init_first;
88 key_cell_init_first.n_row = 0;
89 key_cell_init_first.score = 0;
90 key_cell_init_first.beginning = 0;
91 key_cell_init_first.tree_id = 0;
92 m_interest_cells.push_back(key_cell_init_first);
93}
94
95const std::vector<pappso::specpeptidoms::KeyCell> &
100
101void
103 const SpOMSProtein *protein_ptr)
104{
105 std::size_t sequence_length = protein_ptr->size();
106
107 initFastAlign(spectrum);
108
109 for(std::size_t row_number = 1; row_number <= sequence_length; row_number++)
110 {
111 updateAlignmentMatrix(*protein_ptr,
112 row_number,
113 spectrum.getAaPositions(protein_ptr->at(row_number - 1).code),
114 spectrum,
115 true,
116 protein_ptr);
117 }
118}
119
120void
122{
123 // m_scenario.clear();
124 // TODO don't forget to reset any important variable
125 // m_best_alignment.reset();
126 // m_best_corrected_alignment.reset();
127 // m_best_post_processed_alignment.reset();
128
129 KeyCell key_cell_init;
130 key_cell_init.n_row = 0;
131 key_cell_init.score = m_scorevalues.get(ScoreType::init);
132 key_cell_init.beginning = 0;
133 key_cell_init.tree_id = 0;
134
135 m_interest_cells.resize(spectrum.size());
136 std::fill(m_interest_cells.begin(), m_interest_cells.end(), key_cell_init);
137
138 m_interest_cells.at(0).score = 0;
139
140 // m_location_saver.resetLocationSaver();
141 m_updated_cells.clear();
142}
143
144void
146 std::size_t length2)
147{
148 m_scenario.reserve(length2 + 1, spectrum.size());
149 m_interest_cells.reserve(spectrum.size());
150 m_interest_cells.at(0).n_row = 0;
151 m_interest_cells.at(0).score = 0;
152 m_interest_cells.at(0).beginning = 0;
153 m_interest_cells.at(0).tree_id = 0;
154 for(std::size_t i = 1; i < m_interest_cells.size(); i++)
155 {
156 m_interest_cells.at(i).n_row = 0;
158 m_interest_cells.at(i).beginning = 0;
159 m_interest_cells.at(i).tree_id = 0;
160 }
161 for(std::size_t iter = m_interest_cells.size(); iter < spectrum.size(); iter++)
162 {
163 m_interest_cells.push_back({0, m_scorevalues.get(ScoreType::init), 0, 0});
164 }
165 m_scenario.resetScenario();
166}
167
168void
170 const SpOMSProtein *protein_ptr,
171 const std::size_t beginning,
172 const std::size_t length)
173{
174 try
175 {
176 qDebug();
177 const QString &protein_seq = protein_ptr->getSequence();
178 std::size_t length2;
179 if((qsizetype)(beginning + length) <= protein_seq.size())
180 {
181 length2 = length;
182 }
183 else
184 {
185 length2 = protein_seq.size() - beginning;
186 }
187
188 qDebug();
189 QString sequence_str = protein_seq.sliced(protein_seq.size() - beginning - length2, length2);
190
191 SpOMSProtein sequence("sub_sequence", sequence_str, m_aaCode);
192
193 // std::reverse(sequence.begin(), sequence.end());
194 std::vector<AaPosition> aa_positions;
195 CorrectionTree correction_tree;
196
197 initpreciseAlign(spectrum, length2);
198
199 for(std::size_t row_number = 1; row_number <= length2; row_number++)
200 {
201
202 qDebug() << "row_number - 1=" << row_number - 1 << " sequence.size()=" << sequence.size();
203 // aa = Aa(sequence[row_number - 1].unicode());
204 updateAlignmentMatrix(sequence,
205 row_number,
206 spectrum.getAaPositions(sequence[row_number - 1].code),
207 spectrum,
208 false,
209 protein_ptr);
210 }
211 qDebug();
212 saveBestAlignment(sequence, spectrum, protein_seq.size() - beginning);
213
214 qDebug() << m_scenario.getBestScore() << " " << MIN_ALIGNMENT_SCORE;
215 // Correction : if complementary peaks are used, corrected spectra without one of the two
216 // peaks are generated and aligned. The best alignment is kept.
217 if(m_scenario.getBestScore() >
218 MIN_ALIGNMENT_SCORE) // We only correct alignments with acceptable scores
219 {
220 // we only correct alignment if the sequence has a minimum amino acid diversity
221 if(checkSequenceDiversity(sequence.getSequence(), 5, 2))
222 {
223
224 qDebug();
226 for(std::size_t iter : m_best_alignment.peaks)
227 {
228 if(iter > spectrum.getComplementaryPeak(iter))
229 {
230 break;
231 }
232 else if(std::find(m_best_alignment.peaks.begin(),
233 m_best_alignment.peaks.end(),
234 spectrum.getComplementaryPeak(iter)) !=
235 m_best_alignment.peaks.end())
236 {
237 correction_tree.addPeaks(iter, spectrum.getComplementaryPeak(iter));
238 }
239 }
240 qDebug();
241 std::vector<std::vector<std::size_t>> corrections = correction_tree.getPeaks();
242 if(corrections.size() > 0)
243 {
244 m_best_alignment.score =
245 0; // Reset the best alignment score (we dont want to keep
246 // the original alignment if corrections are needed)
247 qDebug();
248 for(auto peaks_to_remove : corrections)
249 {
250 qDebug();
251 correctAlign(sequence,
252 protein_ptr,
253 spectrum,
254 peaks_to_remove,
255 protein_seq.size() - beginning);
256 qDebug();
257 }
258 qDebug();
260 }
261 }
262 }
263 else
264 {
265 // this sequence has too much redundancy
266 // we have to lower the score
267 m_best_alignment.score = 0;
268 }
269 qDebug();
270 }
271 catch(const pappso::PappsoException &err)
272 {
274 QObject::tr("SemiGlobalAlignment::preciseAlign failed :\n%1").arg(err.qwhat()));
275 }
276}
277
278void
280 const SpOMSProtein *protein_ptr,
281 const SpOMSSpectrum &spectrum,
282 std::vector<std::size_t> &peaks_to_remove,
283 std::size_t offset)
284{
285 std::vector<AaPosition> aa_positions;
286 CorrectionTree correction_tree;
287 std::vector<std::size_t> final_peaks_to_remove;
288
289 KeyCell key_cell_init;
290 key_cell_init.beginning = 0;
291 key_cell_init.n_row = 0;
292 key_cell_init.score = m_scorevalues.get(ScoreType::init);
293 key_cell_init.tree_id = 0;
294
295 std::fill(m_interest_cells.begin(), m_interest_cells.end(), key_cell_init);
296
297 m_interest_cells.at(0).score = 0;
298
299 m_scenario.resetScenario();
300 qDebug();
301 for(qsizetype row_number = 1; row_number <= sequence.size(); row_number++)
302 {
303 qDebug() << row_number - 1 << " " << sequence.size();
304 qDebug() << "sequence[row_number - 1].aa" << (char)sequence[row_number - 1].aa;
305 qDebug();
306 aa_positions = spectrum.getAaPositions(sequence[row_number - 1].code, peaks_to_remove);
307 qDebug();
308 updateAlignmentMatrix(sequence, row_number, aa_positions, spectrum, false, protein_ptr);
309 qDebug();
310 }
311
312 qDebug();
313 // Correction : if complementary peaks are used, corrected spectra without one of the two peaks
314 // are generated and aligned. The best alignment is kept.
315 qDebug() << m_scenario.getBestScore();
316 if(m_scenario.getBestScore() >
317 MIN_ALIGNMENT_SCORE) // We only correct alignments with acceptable scores
318 {
319 qDebug();
320 qDebug() << sequence.getSequence();
321 qDebug() << offset;
322 qDebug() << spectrum.getPrecursorCharge();
323 saveBestAlignment(sequence, spectrum, offset);
324 qDebug();
325 for(std::size_t iter : m_best_alignment.peaks)
326 {
327 qDebug() << "iter:" << iter << "comp:" << spectrum.getComplementaryPeak(iter);
328 if(iter == spectrum.getComplementaryPeak(iter))
329 {
330 continue;
331 }
332 else if(iter > spectrum.getComplementaryPeak(iter))
333 {
334 break;
335 }
336 else if(std::find(m_best_alignment.peaks.begin(),
337 m_best_alignment.peaks.end(),
338 spectrum.getComplementaryPeak(iter)) != m_best_alignment.peaks.end())
339 {
340 correction_tree.addPeaks(iter, spectrum.getComplementaryPeak(iter));
341 }
342 }
343 std::vector<std::vector<std::size_t>> corrections = correction_tree.getPeaks();
344 if(corrections.size() > 0)
345 {
346 for(auto new_peaks_to_remove : corrections)
347 {
348 final_peaks_to_remove = std::vector<std::size_t>(new_peaks_to_remove);
349 final_peaks_to_remove.insert(
350 final_peaks_to_remove.end(), peaks_to_remove.begin(), peaks_to_remove.end());
351 correctAlign(sequence, protein_ptr, spectrum, final_peaks_to_remove, offset);
352 }
353 }
354 else if(m_scenario.getBestScore() > m_best_corrected_alignment.score)
355 {
357 }
358 }
359 qDebug();
360}
361
362void
364 const SpOMSProtein *protein_ptr,
365 std::size_t beginning,
366 std::size_t length,
367 const std::vector<double> &shifts)
368{
369 std::size_t current_SPC = m_best_alignment.SPC;
370 int current_best_score = m_best_alignment.score;
372 for(double precursor_mass_error : shifts)
373 {
374 SpOMSSpectrum corrected_spectrum(spectrum, precursor_mass_error);
375 preciseAlign(corrected_spectrum, protein_ptr, beginning, length);
377 {
379 }
380 }
381 if(m_best_post_processed_alignment.SPC > current_SPC &&
382 m_best_post_processed_alignment.score >= current_best_score)
383 {
385 }
386}
387
388void
391 const std::size_t row_number,
392 const std::vector<AaPosition> &aa_positions,
393 const SpOMSSpectrum &spectrum,
394 const bool fast_align,
395 const pappso::specpeptidoms::SpOMSProtein *protein_ptr)
396{
397 int where = 0;
398 try
399 {
400 int score_found, score_shift, best_score, alt_score, tree_id;
401 uint32_t condition; // FIXME : may be used uninitialised
402 std::size_t best_column, shift, beginning, missing_aas, length, perfect_shift_origin;
403 KeyCell *current_cell_ptr, *tested_cell_ptr;
404 AlignType alignment_type, temp_align_type;
405
406 double smallest_aa_mass = m_aaCode.getMass((std::uint8_t)1);
407
408 m_updated_cells.reserve(aa_positions.size());
409 where = 1;
410 // Computation of the threePeaks condition, see spomsspectrum.h for more details.
411 if(fast_align)
412 {
413 condition = 3;
414 if(row_number > 1)
415 {
416 qDebug() << (char)sequence.at(row_number - 2).aa;
417 qDebug() << "condition" << condition << "aa" << (char)sequence.at(row_number - 2).aa
418 << sequence.at(row_number - 2).code;
419 condition += 2 << sequence.at(row_number - 2).code;
420 qDebug();
421 qDebug() << "condition" << condition;
422 }
423 }
424 where = 2;
425 for(std::vector<AaPosition>::const_iterator aa_position = aa_positions.begin();
426 aa_position != aa_positions.end();
427 aa_position++)
428 {
429 qDebug() << "l_peak" << aa_position->l_peak << "r_peak" << aa_position->r_peak << "l_mass"
430 << aa_position->l_mass << "l_support" << aa_position->l_support << "condition"
431 << aa_position->condition;
432
433 where = 3;
434 if(((condition & aa_position->condition) != 0) ||
435 !fast_align) // Verification of the threePeaks condition (only during first alignment).
436 {
437 if(fast_align)
438 {
439 qDebug() << "threePeaks condition verified";
440 }
441
442 current_cell_ptr = &m_interest_cells.at(aa_position->r_peak);
443 if(spectrum.peakType(aa_position->r_peak) ==
444 specglob::ExperimentalSpectrumDataPointType::both) // The score used are different
445 // depending on the right peak
446 // type (simple or double).
447 {
448 qDebug() << "double peak";
449 score_found = m_scorevalues.get(ScoreType::foundDouble);
451 }
452 else
453 {
454 qDebug() << "single peak";
455 score_found = m_scorevalues.get(ScoreType::found);
456 score_shift = m_scorevalues.get(ScoreType::foundShift);
457 }
458
459 // not found case (always computed)
460 best_column = aa_position->r_peak;
461 best_score = current_cell_ptr->score + (row_number - current_cell_ptr->n_row) *
463 beginning = current_cell_ptr->beginning;
464 tree_id = current_cell_ptr->tree_id;
465 alignment_type = AlignType::notFound;
466 qDebug() << "not found" << best_score;
467
468 // found case (Can only happen if the left peak is supported)
469 if(aa_position->l_support)
470 {
471 tested_cell_ptr = &m_interest_cells.at(aa_position->l_peak);
472 if(aa_position->l_peak == 0)
473 {
474 alt_score = tested_cell_ptr->score + score_found;
475 }
476 else
477 {
478 if(tested_cell_ptr->n_row == row_number - 1)
479 {
480 alt_score = tested_cell_ptr->score +
481 (row_number - tested_cell_ptr->n_row - 1) *
483 score_found;
484 }
485 else
486 {
487 alt_score = tested_cell_ptr->score +
488 (row_number - tested_cell_ptr->n_row - 1) *
490 score_shift;
491 }
492 }
493 if(alt_score >= best_score) // If the found case improves the score, the new value
494 // of the key cell is computed.
495 {
496 alignment_type = AlignType::found;
497 best_score = alt_score;
498 best_column = aa_position->l_peak;
499 if(best_column == 0)
500 {
501 if(row_number < ALIGNMENT_SURPLUS)
502 {
503 beginning = 0;
504 }
505 else
506 {
507 beginning = std::max((std::size_t)(row_number - ALIGNMENT_SURPLUS),
508 (std::size_t)0);
509 }
510 if(fast_align)
511 {
512 tree_id = m_location_saver.getNextTree();
513 }
514 }
515 else
516 {
517 beginning = tested_cell_ptr->beginning;
518 tree_id = tested_cell_ptr->tree_id;
519 }
520 qDebug() << "found" << best_score << "from" << best_column << beginning
521 << tree_id;
522 }
523 }
524
525 where = 4;
526 // generic shift case (all shifts are tested)
527 if(aa_position->l_support)
528 {
529 shift = 1;
530 }
531 else
532 {
533 shift = 0;
534 }
535 while(shift < aa_position->l_peak)
536 {
537 tested_cell_ptr = &m_interest_cells.at(aa_position->l_peak - shift);
538 // verification saut parfait
539 if(perfectShiftPossible(sequence,
540 spectrum,
541 tested_cell_ptr->n_row,
542 row_number,
543 aa_position->l_peak - shift,
544 aa_position->r_peak) &&
546 {
547 alt_score = tested_cell_ptr->score +
548 (row_number - tested_cell_ptr->n_row - 1) *
550 score_found;
551 temp_align_type = AlignType::perfectShift;
552 }
553 else
554 {
555 alt_score = tested_cell_ptr->score +
556 (row_number - tested_cell_ptr->n_row - 1) *
558 score_shift;
559 temp_align_type = AlignType::shift;
560 }
561 if(alt_score > best_score)
562 {
563 alignment_type = temp_align_type;
564 best_score = alt_score;
565 best_column = aa_position->l_peak - shift;
566 beginning = tested_cell_ptr->beginning;
567 tree_id = tested_cell_ptr->tree_id;
568 qDebug() << "shift" << best_score << "from" << best_column;
569 }
570 shift++;
571 }
572
573 where = 5;
574 // case shift from column 0 (no penalties if all precedent amino acids are missed)
575 tested_cell_ptr = &m_interest_cells.at(0);
576 // verification saut parfait
577 if(aa_position->r_peak <= TOL_PEAKS_MISSING_FIRST_COLUMN)
578 {
579 perfect_shift_origin =
580 perfectShiftPossibleFrom0(sequence, spectrum, row_number, aa_position->r_peak);
581 }
582 else
583 {
584 perfect_shift_origin = row_number;
585 }
586
587 if(perfect_shift_origin != row_number)
588 {
589 alt_score = tested_cell_ptr->score + score_found;
590 temp_align_type = AlignType::perfectShift;
591 }
592 else
593 {
594 alt_score = tested_cell_ptr->score + score_shift;
595 temp_align_type = AlignType::shift;
596 }
597
598 where = 6;
599 if(alt_score > best_score)
600 {
601 qDebug() << "shift" << alt_score << "from 0";
602 alignment_type = temp_align_type;
603 best_score = alt_score;
604 best_column = 0;
605 missing_aas =
606 std::floor((aa_position->l_mass -
608 smallest_aa_mass);
609 qDebug() << "missing aas" << missing_aas;
610 if(row_number < ALIGNMENT_SURPLUS + missing_aas)
611 {
612 beginning = 0;
613 }
614 else
615 {
616 beginning =
617 std::max((std::size_t)(row_number - missing_aas - ALIGNMENT_SURPLUS),
618 (std::size_t)0);
619 }
620 where = 7;
621 if(fast_align)
622 {
623 tree_id = m_location_saver.getNextTree();
624 }
625 }
626
627 where = 8;
628 if(best_column != aa_position->r_peak)
629 {
630 m_updated_cells.push_back(
631 {aa_position->r_peak, {row_number, best_score, beginning, tree_id}});
632 }
633
634 where = 9;
635 if(best_score > m_location_saver.getMinScore(tree_id) && fast_align)
636 {
637 length =
638 row_number - beginning + 1 +
639 std::ceil(spectrum.getMissingMass(aa_position->r_peak) / smallest_aa_mass) +
641 where = 10;
642 m_location_saver.addLocation(beginning, length, tree_id, best_score, protein_ptr);
643 }
644 else if(!fast_align)
645 {
646
647 where = 11;
648 if(alignment_type == AlignType::perfectShift && best_column == 0)
649 {
650 m_scenario.saveOrigin(row_number,
651 aa_position->r_peak,
652 perfect_shift_origin,
653 0,
654 best_score,
656 }
657 else
658 {
659 m_scenario.saveOrigin(row_number,
660 aa_position->r_peak,
661 m_interest_cells.at(best_column).n_row,
662 best_column,
663 best_score,
664 alignment_type);
665 }
666 }
667 }
668 }
669
670 where = 30;
671 // Update row number in column 0
672 m_updated_cells.push_back({0, {row_number, 0, 0, 0}});
673
674 // Save updated key cells in the matrix
675 while(m_updated_cells.size() > 0)
676 {
677 qDebug() << m_interest_cells.size() << " " << m_updated_cells.back().first
678 << m_updated_cells.back().second.n_row << m_updated_cells.back().second.score
679 << m_updated_cells.back().second.beginning
680 << m_updated_cells.back().second.tree_id;
681 m_interest_cells.at(m_updated_cells.back().first) = m_updated_cells.back().second;
682 m_updated_cells.pop_back();
683 }
684 where++;
685 }
686 catch(const std::exception &error)
687 {
689 QObject::tr("updateAlignmentMatrix failed std::exception :\n%1 %2")
690 .arg(where)
691 .arg(error.what()));
692 }
693 catch(const pappso::PappsoException &err)
694 {
696 QObject::tr("updateAlignmentMatrix failed :\n%1").arg(err.qwhat()));
697 }
698}
699
700bool
703 const SpOMSSpectrum &spectrum,
704 const std::size_t origin_row,
705 const std::size_t current_row,
706 const std::size_t l_peak,
707 const std::size_t r_peak) const
708{
709 try
710 {
711 double missing_mass = 0;
712 auto it_end = sequence.begin() + current_row;
713 for(auto iter = sequence.begin() + origin_row; (iter != it_end) && (iter != sequence.end());
714 iter++)
715 {
716 missing_mass += iter->mass; // Aa(iter->unicode()).getMass();
717 }
718 if(missing_mass > 0)
719 {
720 pappso::MzRange mz_range(missing_mass, m_precision_ptr);
721 return mz_range.contains(spectrum.getMZShift(l_peak, r_peak));
722 }
723 else
724 {
725 return false;
726 }
727 }
728 catch(const std::exception &error)
729 {
731 QObject::tr("perfectShiftPossible failed std exception:\n%1").arg(error.what()));
732 }
733 catch(const pappso::PappsoException &err)
734 {
736 QObject::tr("perfectShiftPossible failed :\n%1").arg(err.qwhat()));
737 }
738}
739
740std::size_t
743 const SpOMSSpectrum &spectrum,
744 const std::size_t current_row,
745 const std::size_t r_peak) const
746{
747 std::size_t perfect_shift_origin = current_row;
748 double missing_mass = spectrum.getMZShift(0, r_peak);
749 pappso::MzRange mz_range(missing_mass, m_precision_ptr);
750 double aa_mass = 0;
751 while(aa_mass < missing_mass && perfect_shift_origin > 0 && !mz_range.contains(aa_mass))
752 {
753 aa_mass += sequence.at(perfect_shift_origin - 1)
754 .mass; // Aa(sequence.at(perfect_shift_origin - 1).unicode()).getMass();
755 perfect_shift_origin--;
756 }
757 if(mz_range.contains(aa_mass))
758 {
759 return perfect_shift_origin;
760 }
761 else
762 {
763 return current_row;
764 }
765}
766
767std::size_t
770 const SpOMSSpectrum &spectrum,
771 std::size_t end_row,
772 std::size_t end_peak) const
773{
774 try
775 {
776 std::size_t perfect_shift_end = end_row + 1;
777 double missing_mass = spectrum.getMissingMass(end_peak);
778 pappso::MzRange mz_range(missing_mass, m_precision_ptr);
779 double aa_mass = 0;
780 while(aa_mass < missing_mass && perfect_shift_end < (std::size_t)sequence.size() &&
781 !mz_range.contains(aa_mass))
782 {
783 aa_mass += sequence.at(perfect_shift_end - 1)
784 .mass; // Aa(sequence.at(perfect_shift_end - 1).unicode()).getMass();
785 perfect_shift_end++;
786 }
787 if(mz_range.contains(aa_mass))
788 {
789 return perfect_shift_end - 1;
790 }
791 else
792 {
793 return end_row;
794 }
795 }
796 catch(const pappso::PappsoException &err)
797 {
799 QObject::tr("perfectShiftPossibleEnd failed :\n%1").arg(err.qwhat()));
800 }
801}
802
806
812
818
819void
822 const SpOMSSpectrum &spectrum,
823 std::size_t offset)
824{
825 qDebug();
826 m_best_alignment.m_peptideModel.reset();
827 m_best_alignment.peaks.clear();
828 m_best_alignment.shifts.clear();
829 std::size_t previous_row; // FIXME : may be used uninitialised
830 std::size_t previous_column = 0;
831 std::size_t perfect_shift_end;
832 std::pair<std::vector<ScenarioCell>, int> best_alignment = m_scenario.getBestAlignment();
833 m_best_alignment.score = best_alignment.second;
834 std::vector<SpOMSAa> skipped_aa;
835 double skipped_mass;
836 // Retrieving beginning and end
837 if(best_alignment.first.front().previous_row > offset)
838 {
840 QString("best_alignment.first.front().previous_row > offset %1 %2")
841 .arg(offset)
842 .arg(best_alignment.first.front().previous_row));
843 }
844 if(best_alignment.first.back().previous_row > offset)
845 {
847 QString("best_alignment.first.back().previous_row > offset %1 %2")
848 .arg(offset)
849 .arg(best_alignment.first.back().previous_row));
850 }
851 m_best_alignment.beginning = offset - best_alignment.first.front().previous_row;
852 m_best_alignment.end = offset - best_alignment.first.back().previous_row - 1;
853
854 qDebug();
855 AminoAcidModel aa_model;
856 aa_model.m_massDifference = 0;
857 // Filling temp_interpretation and peaks vectors
858 for(auto cell : best_alignment.first)
859 {
860 switch(cell.alignment_type)
861 {
862 case AlignType::found:
863 aa_model.m_aminoAcid = sequence.at(previous_row - 1).aa;
864 aa_model.m_massDifference = 0;
865 aa_model.m_skipped = false;
866 m_best_alignment.m_peptideModel.push_back(aa_model);
867 if(previous_row > cell.previous_row + 1)
868 {
869 skipped_mass = sequence.at(previous_row - 1)
870 .mass; // Aa(sequence.at(previous_row - 1).unicode()).getMass();
871 skipped_aa =
872 sequence.sliced(cell.previous_row, previous_row - cell.previous_row - 1);
873 aa_model.m_massDifference = 0;
874 aa_model.m_skipped = true;
875 for(auto aa : skipped_aa)
876 {
877 aa_model.m_aminoAcid = aa.aa;
878 m_best_alignment.m_peptideModel.push_back(aa_model);
879 skipped_mass += aa.mass; // Aa(aa.unicode()).getMass();
880 }
881 m_best_alignment.m_peptideModel.back().m_massDifference =
882 spectrum.getMZShift(cell.previous_column, previous_column) - skipped_mass;
883 }
884 m_best_alignment.peaks.push_back(cell.previous_column);
885 break;
887 aa_model.m_aminoAcid = sequence.at(previous_row - 1).aa;
888 aa_model.m_massDifference = 0;
889 aa_model.m_skipped = true;
890 m_best_alignment.m_peptideModel.push_back(aa_model);
891 break;
892 case AlignType::shift:
893
894 aa_model.m_aminoAcid = sequence.at(previous_row - 1).aa;
895 aa_model.m_massDifference = spectrum.getMZShift(cell.previous_column, previous_column) -
896 aa_model.m_aminoAcid.getMass();
897 aa_model.m_skipped = false;
898 m_best_alignment.m_peptideModel.push_back(aa_model);
899 m_best_alignment.peaks.push_back(cell.previous_column);
900 m_best_alignment.shifts.push_back(
901 spectrum.getMZShift(cell.previous_column, previous_column) -
902 sequence.at(previous_row - 1).mass);
903 break;
905 m_best_alignment.peaks.push_back(cell.previous_column);
906 skipped_aa = sequence.sliced(cell.previous_row, previous_row - cell.previous_row);
907 std::reverse(skipped_aa.begin(), skipped_aa.end());
908 aa_model.m_massDifference = 0;
909 aa_model.m_skipped = false;
910 for(auto aa : skipped_aa)
911 {
912 aa_model.m_aminoAcid = aa.aa;
913 m_best_alignment.m_peptideModel.push_back(aa_model);
914 }
915 break;
916 case AlignType::init:
917 previous_row = cell.previous_row;
918 previous_column = cell.previous_column;
919 m_best_alignment.peaks.push_back(cell.previous_column);
920 break;
921 }
922 previous_row = cell.previous_row;
923 previous_column = cell.previous_column;
924 }
925 std::reverse(m_best_alignment.peaks.begin(), m_best_alignment.peaks.end());
926 std::reverse(m_best_alignment.m_peptideModel.begin(), m_best_alignment.m_peptideModel.end());
927
928 qDebug();
929 // Compute begin_shift and end_shift
930 MzRange zero(0, m_precision_ptr);
931 m_best_alignment.begin_shift = spectrum.getMZShift(0, m_best_alignment.peaks.front());
932 m_best_alignment.end_shift = spectrum.getMissingMass(m_best_alignment.peaks.back());
933 if(zero.contains(m_best_alignment.end_shift))
934 {
935 m_best_alignment.end_shift = 0;
936 }
937
938 qDebug();
939 // Computing SPC
940 m_best_alignment.SPC = 0;
941 for(auto peak : m_best_alignment.peaks)
942 {
943 switch(spectrum.at(peak).type)
944 {
946 qDebug() << peak << "native";
947 m_best_alignment.SPC += 1;
948 break;
950 qDebug() << peak << "both";
951 m_best_alignment.SPC += 2;
952 break;
954 qDebug() << peak << "synthetic";
955 break;
957 qDebug() << peak << "symmetric";
958 m_best_alignment.SPC += 1;
959 break;
960 }
961 }
962
963 qDebug();
964 // Final check of the end shift
965 if(m_best_alignment.end_shift > 0)
966 {
967 perfect_shift_end = perfectShiftPossibleEnd(sequence,
968 spectrum,
969 best_alignment.first.front().previous_row,
970 m_best_alignment.peaks.back());
971 if(perfect_shift_end != best_alignment.first.front().previous_row)
972 {
973 skipped_aa =
974 sequence.sliced(best_alignment.first.front().previous_row,
975 perfect_shift_end - best_alignment.first.front().previous_row);
976 aa_model.m_massDifference = 0;
977 aa_model.m_skipped = true;
978 for(auto aa = skipped_aa.begin(); aa != skipped_aa.end(); aa++)
979 {
980 aa_model.m_aminoAcid = aa->aa;
981 m_best_alignment.m_peptideModel.push_back(aa_model);
982 }
983 m_best_alignment.beginning = offset - perfect_shift_end;
984 m_best_alignment.end_shift = 0;
985 }
986 else
987 {
989 }
990 }
991
992 qDebug();
993 // Writing final interpretation
994 if(m_best_alignment.end_shift > 0)
995 {
996 m_best_alignment.m_peptideModel.setNterShift(m_best_alignment.end_shift);
997 }
998
999 std::reverse(m_best_alignment.m_peptideModel.begin(), m_best_alignment.m_peptideModel.end());
1000 if(m_best_alignment.begin_shift > 0)
1001 {
1002 m_best_alignment.m_peptideModel.setCterShift(m_best_alignment.begin_shift);
1003 }
1004
1005 m_best_alignment.m_peptideModel.setPrecursorMass(spectrum.getPrecursorMass());
1006 qDebug();
1007}
1008
1014
1015std::vector<double>
1017 const Alignment &alignment,
1018 const QString &protein_seq)
1019{
1020 // qDebug() << protein_seq;
1021 if(alignment.end > (std::size_t)protein_seq.size())
1022 {
1023 throw pappso::ExceptionOutOfRange(QString("alignment.end > protein_seq.size() %1 %2")
1024 .arg(alignment.end)
1025 .arg(protein_seq.size()));
1026 }
1027 std::vector<double> potential_mass_errors(alignment.shifts);
1028 double shift = alignment.end_shift;
1029 std::size_t index;
1030 if(alignment.beginning > 0)
1031 { // -1 on unsigned int makes it wrong
1032 index = alignment.beginning - 1;
1033 while(shift > 0 && index > 0)
1034 {
1035 potential_mass_errors.push_back(shift);
1036 // qDebug() << " shift=" << shift << " index=" << index
1037 // << " letter=" << protein_seq.at(index).toLatin1();
1038 shift -= aa_code.getMass(
1039 protein_seq.at(index).toLatin1()); // Aa(protein_seq.at(index).unicode()).getMass();
1040 index--;
1041 }
1042 }
1043
1044 // qDebug() << "second";
1045 shift = alignment.begin_shift;
1046 index = alignment.end + 1;
1047 while(shift > 0 && index < (std::size_t)protein_seq.size())
1048 {
1049 potential_mass_errors.push_back(shift);
1050 qDebug() << " shift=" << shift << " index=" << index
1051 << " letter=" << protein_seq.at(index).toLatin1();
1052 shift -= aa_code.getMass(
1053 protein_seq.at(index).toLatin1()); // Aa(protein_seq.at(index).unicode()).getMass();
1054 index++;
1055 }
1056 // qDebug();
1057 return potential_mass_errors;
1058}
1059
1060bool
1062 std::size_t window,
1063 std::size_t minimum_aa_diversity)
1064{
1065 qDebug() << "sequence=" << sequence << " window=" << window
1066 << " minimum_aa_diversity=" << minimum_aa_diversity;
1067 if(sequence.size() < window)
1068 return false;
1069 auto it_begin = sequence.begin();
1070 auto it_end = sequence.begin() + window;
1071 QString window_copy(sequence.mid(0, window));
1072 while(it_end != sequence.end())
1073 {
1074 std::partial_sort_copy(it_begin, it_end, window_copy.begin(), window_copy.end());
1075
1076 qDebug() << window_copy;
1077 std::size_t uniqueCount =
1078 std::unique(window_copy.begin(), window_copy.end()) - window_copy.begin();
1079
1080 qDebug() << uniqueCount;
1081 if(uniqueCount < minimum_aa_diversity)
1082 return false;
1083 it_begin++;
1084 it_end++;
1085 }
1086 return true;
1087}
1088
1089const std::vector<pappso::specpeptidoms::KeyCell> &
1092 const std::size_t row_number,
1093 const std::vector<AaPosition> &aa_positions,
1094 const SpOMSSpectrum &spectrum,
1095 const bool fast_align,
1096 const pappso::specpeptidoms::SpOMSProtein *protein_ptr)
1097{
1098 updateAlignmentMatrix(sequence, row_number, aa_positions, spectrum, fast_align, protein_ptr);
1099 return getInterestCells();
1100}
collection of integer code for each amino acid 0 => null 1 to 20 => amino acid sorted by there mass (...
Definition aacode.h:44
double getMass(uint8_t aa_code) const
get the mass of the amino acid given its integer code the amino acid can bear some modification (if a...
Definition aacode.cpp:224
pappso_double getMass() const override
Definition aa.cpp:90
bool contains(pappso_double) const
Definition mzrange.cpp:115
virtual const QString & qwhat() const
std::vector< std::vector< std::size_t > > getPeaks() const
void addPeaks(std::size_t peak1, std::size_t peak2)
void initpreciseAlign(const SpOMSSpectrum &spectrum, std::size_t length2)
function made for testing the preciseAlign process, initiate the variables for alignment
const Alignment & getBestAlignment() const
Returns a const ref to m_best_alignment.
Scenario getScenario() const
Returns a copy of m_scenario.
std::size_t perfectShiftPossibleEnd(const pappso::specpeptidoms::SpOMSProtein &sequence, const SpOMSSpectrum &spectrum, std::size_t end_row, std::size_t end_peak) const
indicates if a perfect shift is possible between the provided positions
void updateAlignmentMatrix(const pappso::specpeptidoms::SpOMSProtein &sequence, const std::size_t row_number, const std::vector< AaPosition > &aa_positions, const SpOMSSpectrum &spectrum, const bool fast_align, const pappso::specpeptidoms::SpOMSProtein *protein_ptr)
updates the scores of the alignment matrix for a given amino acid as well as the location heap/scenar...
void postProcessingAlign(const SpOMSSpectrum &spectrum, const SpOMSProtein *protein_ptr, std::size_t beginning, std::size_t length, const std::vector< double > &shifts)
performs the post-processing : generates corrected spectra and align them
void preciseAlign(const SpOMSSpectrum &spectrum, const SpOMSProtein *protein_ptr, const std::size_t beginning, const std::size_t length)
performs the second alignment search between a protein subsequence and a spectrum.
void correctAlign(const SpOMSProtein &protein_subseq, const SpOMSProtein *protein_ptr, const SpOMSSpectrum &spectrum, std::vector< std::size_t > &peaks_to_remove, std::size_t offset)
Recursively performs the correction of the alignment.
const std::vector< KeyCell > & oneAlignStep(const pappso::specpeptidoms::SpOMSProtein &sequence, const std::size_t row_number, const std::vector< AaPosition > &aa_positions, const SpOMSSpectrum &spectrum, const bool fast_align, const pappso::specpeptidoms::SpOMSProtein *protein_ptr)
function made for testing the fastAlign process, process one line and return the alignment matrix
const std::vector< KeyCell > & getInterestCells() const
convenient function for degub purpose
bool perfectShiftPossible(const pappso::specpeptidoms::SpOMSProtein &sequence, const SpOMSSpectrum &spectrum, const std::size_t origin_row, const std::size_t current_row, const std::size_t l_peak, const std::size_t r_peak) const
indicates if a perfect shift is possible between the provided positions
std::vector< std::pair< std::size_t, KeyCell > > m_updated_cells
void fastAlign(const SpOMSSpectrum &spectrum, const SpOMSProtein *protein_ptr)
perform the first alignment search between a protein sequence and a spectrum. The member location hea...
static bool checkSequenceDiversity(const QString &sequence, std::size_t window, std::size_t minimum_aa_diversity)
check that the sequence has a minimum of amino acid checkSequenceDiversity
std::size_t perfectShiftPossibleFrom0(const pappso::specpeptidoms::SpOMSProtein &sequence, const SpOMSSpectrum &spectrum, const std::size_t current_row, const std::size_t r_peak) const
indicates if a perfect shift is possible from the spectrum beginning to the provided peak....
void initFastAlign(const SpOMSSpectrum &spectrum)
function made for testing the fastAlign process, initiate the variables for alignment
static std::vector< double > getPotentialMassErrors(const pappso::AaCode &aa_code, const Alignment &alignment, const QString &protein_seq)
Returns a list of the potential mass errors corresponding to the provided alignment in the provided p...
void saveBestAlignment(const SpOMSProtein &sequence, const SpOMSSpectrum &spectrum, std::size_t offset)
Stores the best alignment from m_scenario in m_best_alignment.
SemiGlobalAlignment(const ScoreValues &score_values, const pappso::PrecisionPtr precision_ptr, const AaCode &aaCode)
LocationSaver getLocationSaver() const
Returns a copy of m_location_saver.
const QString & getSequence() const
std::vector< SpOMSAa > sliced(std::size_t position, std::size_t length) const
double getMZShift(std::size_t l_peak, std::size_t r_peak) const
Returns the mz difference between two peaks.
uint getPrecursorCharge() const
Returns the spectrum's precursor's charge.
double getMissingMass(std::size_t peak) const
Returns the missing mass between a peak and the precursor's mass (shift at the end).
std::size_t getComplementaryPeak(std::size_t peak) const
const std::vector< AaPosition > & getAaPositions(std::uint8_t aa_code) const
Returns the list of aa_positions for a given amino acid code.
specglob::ExperimentalSpectrumDataPointType peakType(std::size_t indice) const
Returns the type of one of the spectrum's peaks.
@ synthetic
does not correspond to existing peak, for computational purpose
Definition types.h:85
@ both
both, the ion and the complement exists in the original spectrum
Definition types.h:83
@ symmetric
new peak : computed symmetric mass from a corresponding native peak
Definition types.h:81
const uint ALIGNMENT_SURPLUS(5)
const int MIN_ALIGNMENT_SCORE(15)
const uint TOL_PEAKS_MISSING(4)
const uint TOL_PEAKS_MISSING_FIRST_COLUMN(5)
const pappso_double MHPLUS(1.007276466879)
const pappso_double MPROTIUM(1.007825032241)
const pappso_double MASSOXYGEN(15.99491461956)
const PrecisionBase * PrecisionPtr
Definition precision.h:122
void reset()
reinitialize to default score_values
QString getPeptideString(const QString &protein_sequence) const
convenient function to get peptide sequence from location
double getNonAlignedMass() const
convenient function to get the remaining non explained mass shift
std::vector< std::size_t > peaks
std::size_t getPositionStart() const
get position of start on the protein sequence