libStatGen Software 1
TrimSequence.h
1/*
2 * Copyright (C) 2010 Regents of the University of Michigan
3 *
4 * This program is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18#ifndef _TRIMSEQUENCE_H
19#define _TRIMSEQUENCE_H
20
21#include <assert.h>
22#include <stdint.h>
23#include <stdlib.h>
24
25#ifndef __WIN32__
26#include <unistd.h>
27#endif
28
29///
30/// TrimSequence is a templated function to find bases
31/// which are below a certain moving mean threshold,
32/// and can be applied to either end of the sequence string.
33///
34/// @param sequence is the input sequence
35/// @param meanValue is the value below which we wish to trim.
36/// @return the iterator of the location at which untrimmed values begin
37///
38/// Details:
39///
40/// trimFromLeft is a bool indicating which direction we wish
41/// to trim. true -> left to right, false is right to left.
42///
43/// The code is convoluted enough here, so for implementation
44/// and testing sanity, the following definitions are made:
45///
46/// When trimFromLeft is true:
47/// result == sequence.begin() implies no trimming
48/// result == sequence.end() implies all values are trimmed
49///
50/// When trimFromLeft is false:
51/// result == sequence.begin() implies all values are trimmed
52/// result == sequence.end() no values are trimmed
53///
54/// result will always be in the range [sequence.begin() , sequence.end())
55/// (begin is inclusive, end is exclusive).
56///
57/// NOTE: See TrimSequence.h and test/TrimSequence_test.cpp for examples
58///
59/// THIS CODE IS EXCEPTIONALLY FRAGILE. DO NOT ATTEMPT TO FIX OR
60/// IMPROVE WITHOUT INCLUDING DOCUMENTED, UNDERSTANABLE TEST CASES THAT CLEARLY
61/// SHOW WHY OR WHY NOT SOMETHING WORKS.
62///
63template<typename sequenceType, typename meanValueType>
64typename sequenceType::iterator trimSequence(sequenceType &sequence, meanValueType meanValue, const bool trimFromLeft)
65{
66 const int howManyValues = 4; // this is used in signed arithmetic below
67 int windowThreshold = howManyValues * meanValue;
68 int64_t sumOfWindow = 0;
69 typename sequenceType::iterator it;
70
71 //
72 // Sanity check to weed out what otherwise would be
73 // a large number of boundary checks below. If the input
74 // is too small, just punt it back to the caller. Technically,
75 // we can still trim, but we'd just do the simple iteration
76 // loop. Save that for when we care.
77 //
78
79 if (sequence.size() < (size_t) howManyValues)
80 return trimFromLeft? sequence.begin() : sequence.end();
81
82 typename sequenceType::iterator sequenceBegin;
83 typename sequenceType::iterator sequenceEnd;
84
85 // The algorithm should be clear and efficient
86 // so it does not bother to write codes for two directions.
87 // It that way, we avoid thinking trimming from left and right interchangably.
88 if (trimFromLeft)
89 {
90 // sequenceBegin is inclusive, sequenceEnd is exclusive,
91 sequenceBegin = sequence.begin();
92 sequenceEnd = sequence.end();
93
94 for (it = sequenceBegin; it < sequenceBegin + howManyValues; it++)
95 sumOfWindow += *it;
96
97 for (; it < sequenceEnd; it ++)
98 {
99 if (sumOfWindow > windowThreshold)
100 break;
101 sumOfWindow += *it;
102 sumOfWindow -= *(it - howManyValues);
103 }
104 // here it is in the range of [sequenceBegin+howManyValues, sequenceEnd] inclusively
105 // the range is also [sequence.begin() + howManyValues, sequence.end()]
106 while (*(it-1) >= meanValue && (it-1) >= sequenceBegin)
107 it--;
108 }
109 else
110 {
111 sequenceBegin = sequence.end() - 1;
112 sequenceEnd = sequence.begin() - 1;
113
114 for (it = sequenceBegin; it > sequenceBegin - howManyValues; it--)
115 sumOfWindow += *it;
116
117 for (; it > sequenceEnd; it--)
118 {
119 if (sumOfWindow > windowThreshold)
120 break;
121 sumOfWindow += *it;
122 sumOfWindow -= *(it + howManyValues);
123 }
124
125 // here it is in the range of [sequenceEnd, sequenceBegin - howManyValues] inclusively
126 // the range is also [sequence.begin() -1, sequence.end() - 1 - howManyValues]
127 while (*(it+1) >= meanValue && (it+1) <= sequenceBegin)
128 it ++;
129 // note, the return value should in the range [sequence.begin(), sequence.end()]
130 it += 1;
131 }
132
133 // 'it' may be sequence.end() in some cases
134 assert(it >= sequence.begin() && it <= sequence.end());
135
136 return it;
137}
138
139#endif