libStatGen Software 1
glfHandler.cpp
1/*
2 * Copyright (C) 2010 Regents of the University of Michigan
3 *
4 * This program is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18#include "glfHandler.h"
19#include "BaseQualityHelper.h"
20
21char glfHandler::translateBase[16] = {0, 1, 2, 0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0};
22char glfHandler::backTranslateBase[5] = { 15, 1, 2, 4, 8 };
23unsigned char glfHandler::nullLogLikelihoods[10] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
24double glfHandler::nullLikelihoods[10] = {1., 1., 1., 1., 1., 1., 1., 1., 1., 1.};
25
26glfHandler::glfHandler()
27{
28 isStub = true;
29 sections = 0;
30 currentSection = 0;
31 maxPosition = position = endOfSection = 0;
32}
33
34glfHandler::~glfHandler()
35{
36 // Not safe to close the file here in case a copy of the file was generated
37 // if (isOpen())
38 // Close();
39}
40
41bool glfHandler::Open(const String & filename)
42{
43 isStub = false;
44 handle = ifopen(filename, "rb");
45
46 if (handle == NULL)
47 {
48 isStub = true;
49 return false;
50 }
51
52 if (!ReadHeader())
53 ifclose(handle);
54
55 endOfSection = true;
56
57 return handle != NULL;
58}
59
60void glfHandler::OpenStub()
61{
62 isStub = true;
63 handle = NULL;
64
65 endOfSection = true;
66 data.recordType = 0;
67 maxPosition = 1999999999;
68 position = maxPosition + 1;
69}
70
71bool glfHandler::Create(const String & filename)
72{
73 isStub = false;
74 // glf is in BGZF format.
75 handle = ifopen(filename, "wb", InputFile::BGZF);
76
77 if (handle == NULL)
78 {
79 isStub = true;
80 return false;
81 }
82
83 WriteHeader();
84
85 return handle != NULL;
86}
87
88bool glfHandler::isOpen()
89{
90 return handle != NULL;
91}
92
93bool glfHandler::ReadHeader()
94{
95 if (isStub)
96 return true;
97
98 if (handle == NULL)
99 return false;
100
101 char magicNumber[4];
102
103 if (ifread(handle, magicNumber, 4) != 4)
104 {
105 errorMsg = "unexpected end of file";
106 return false;
107 }
108
109 if (magicNumber[0] != 'G' || magicNumber[1] != 'L' || magicNumber[2] != 'F')
110 {
111 errorMsg = "invalid format";
112 return false;
113 }
114
115 if (magicNumber[3] != 3)
116 {
117 errorMsg = "unsupported version";
118 return false;
119 }
120
121 unsigned int headerLength = 0;
122
123 if (ifread(handle, &headerLength, 4) != 4)
124 {
125 errorMsg = "unexpected end of file";
126 return false;
127 }
128
129 if (headerLength > 1024 * 1024)
130 {
131 errorMsg = "header too large -- bailing";
132 return false;
133 }
134
135 header.SetLength(headerLength + 1);
136 header[headerLength] = 0;
137
138 if (headerLength && ifread(handle, header.LockBuffer(headerLength + 1), headerLength) != headerLength)
139 {
140 errorMsg = "unexpected end of file";
141 return false;
142 }
143
144 return true;
145}
146
147void glfHandler::Close()
148{
149 if (isOpen())
150 ifclose(handle);
151}
152
153void glfHandler::Rewind()
154{
155 if (isOpen())
156 {
157 ifrewind(handle);
158
159 if (!ReadHeader())
160 ifclose(handle);
161
162 endOfSection = true;
163 }
164}
165
166bool glfHandler::NextSection()
167{
168 if (isStub)
169 {
170 endOfSection = true;
171 data.recordType = 0;
172 maxPosition = 1999999999;
173 position = maxPosition + 1;
174 return true;
175 }
176
177 while (!endOfSection && !ifeof(handle))
178 NextEntry();
179
180 endOfSection = false;
181
182 int labelLength = 0;
183
184 currentSection++;
185 position = 0;
186 if (ifread(handle, &labelLength, sizeof(int)) == sizeof(int))
187 {
188 ifread(handle, label.LockBuffer(labelLength+1), labelLength * sizeof(char));
189 label.UnlockBuffer();
190
191 maxPosition = 0;
192 ifread(handle, &maxPosition, sizeof(int));
193
194 return ((maxPosition > 0) && !ifeof(handle));
195 }
196
197 return false;
198}
199
200bool glfHandler::NextBaseEntry()
201{
202 bool result = true;
203
204 do
205 {
206 result = NextEntry();
207 }
208 while (result && data.recordType == 2);
209
210 return result;
211}
212
213
214bool glfHandler::NextEntry()
215{
216 if (isStub)
217 return false;
218
219 // Read record type
220 if (endOfSection || (ifread(handle, &data, 1) != 1))
221 {
222 endOfSection = true;
223 data.recordType = 0;
224 position = maxPosition + 1;
225 return false;
226 }
227
228 // printf("%d/%d\n", data.recordType, data.refBase);
229
230 if (position > maxPosition)
231 return true;
232
233 switch (data.recordType)
234 {
235 case 0 :
236 endOfSection = true;
237 position = maxPosition + 1;
238 return true;
239 case 1 :
240 if (ifread(handle,((char *) &data) + 1, sizeof(data) - 1) == sizeof(data) - 1)
241 {
242 data.refBase = translateBase[data.refBase];
243
244 for (int i = 0; i < 10; i++)
245 likelihoods[i] = bQualityConvertor.toDouble(data.lk[i]);
246
247 position = position + data.offset;
248 return true;
249 }
250
251 // Premature end of file
252 data.recordType = 0;
253 position = maxPosition + 1;
254 return false;
255 case 2 :
256 while (ifread(handle, ((char *) &data) + 1, sizeof(data) - 4) == sizeof(data) - 4)
257 {
258 data.refBase = translateBase[data.refBase];
259
260 for (int i = 0; i < 3; i++)
261 likelihoods[i] = bQualityConvertor.toDouble(data.indel.lk[i]);
262
263 position = position + data.offset;
264
265 indelSequence[0].SetLength(abs(data.indel.length[0]) + 1);
266 indelSequence[0][abs(data.indel.length[0])] = 0;
267 if (ifread(handle, indelSequence[0].LockBuffer(), abs(data.indel.length[0])) != (unsigned int) abs(data.indel.length[0]))
268 break;
269
270 indelSequence[1].SetLength(abs(data.indel.length[1]) + 1);
271 indelSequence[1][abs(data.indel.length[1])] = 0;
272 if (ifread(handle, indelSequence[1].LockBuffer(), abs(data.indel.length[1])) != (unsigned int) abs(data.indel.length[1]))
273 break;
274
275 return true;
276 }
277
278 // Premature end of file
279 data.recordType = 0;
280 position = maxPosition + 1;
281 return false;
282 }
283
284 return false;
285}
286
287glfEntry & glfEntry::operator = (glfEntry & rhs)
288{
289 refBase = rhs.refBase;
290 recordType = rhs.recordType;
291 offset = rhs.offset;
293
294 for (int i = 0; i < 10; i++)
295 lk[i] = rhs.lk[i];
296
297 minLLK = rhs.minLLK;
298 depth = rhs.depth;
299
300 return * this;
301}
302
303const double * glfHandler::GetLikelihoods(int pos)
304{
305 if (pos == position)
306 return likelihoods;
307
308 return nullLikelihoods;
309}
310
311const unsigned char * glfHandler::GetLogLikelihoods(int pos)
312{
313 if (pos == position)
314 return data.lk;
315
316 return nullLogLikelihoods;
317}
318
319char glfHandler::GetReference(int pos, char defaultBase)
320{
321 if (pos == position)
322 return data.refBase;
323
324 return defaultBase;
325}
326
327int glfHandler::GetDepth(int pos)
328{
329 if (pos == position)
330 return data.depth;
331
332 return 0;
333}
334
335int glfHandler::GetMapQuality(int pos)
336{
337 if (pos == position)
338 return data.mapQuality;
339
340 return 0;
341}
342
343void glfHandler::WriteHeader(const String & headerText)
344{
345 char magicNumber[4] = {'G', 'L', 'F', 3};
346
347 ifwrite(handle, magicNumber, 4);
348
349 unsigned int headerLength = headerText.Length();
350
351 ifwrite(handle, &headerLength, 4);
352 ifwrite(handle, (void *)(const char *) headerText, headerLength);
353}
354
355void glfHandler::BeginSection(const String & sectionLabel, int sectionLength)
356{
357 int labelLength = sectionLabel.Length() + 1;
358
359 ifwrite(handle, &labelLength, sizeof(int));
360 ifwrite(handle, (void *)(const char *) sectionLabel, labelLength);
361 ifwrite(handle, &sectionLength, sizeof(int));
362
363 label = sectionLabel;
364 maxPosition = sectionLength;
365}
366
367void glfHandler::EndSection()
368{
369 char marker = 0;
370
371 ifwrite(handle, &marker, sizeof(char));
372}
373
374void glfHandler::WriteEntry(int outputPosition)
375{
376 data.offset = outputPosition - position;
377 position = outputPosition;
378
379 switch (data.recordType)
380 {
381 case 0 :
382 EndSection();
383 return;
384 case 1 :
385 data.refBase = backTranslateBase[data.refBase];
386 ifwrite(handle, &data, sizeof(data));
387 data.refBase = translateBase[data.refBase];
388 return;
389 case 2 :
390 data.refBase = backTranslateBase[data.refBase];
391 ifwrite(handle, &data, sizeof(data) - 3);
392 data.refBase = translateBase[data.refBase];
393
394 ifwrite(handle, (void *)(const char *) indelSequence[0], abs(data.indel.length[0]));
395 ifwrite(handle, (void *)(const char *) indelSequence[1], abs(data.indel.length[1]));
396 return;
397 }
398}
399
int ifeof(IFILE file)
Check to see if we have reached the EOF (returns 0 if not EOF).
Definition: InputFile.h:654
void ifrewind(IFILE file)
Reset to the beginning of the file (cannot be done for stdin/stdout).
Definition: InputFile.h:642
IFILE ifopen(const char *filename, const char *mode, InputFile::ifileCompression compressionMode=InputFile::DEFAULT)
Open a file with the specified name and mode, using a filename of "-" to indicate stdin/stdout.
Definition: InputFile.h:562
int ifclose(IFILE &file)
Close the file.
Definition: InputFile.h:580
unsigned int ifread(IFILE file, void *buffer, unsigned int size)
Read up to size bytes from the file into the buffer.
Definition: InputFile.h:600
unsigned int ifwrite(IFILE file, const void *buffer, unsigned int size)
Write the specified number of bytes from the specified buffer into the file.
Definition: InputFile.h:669
@ BGZF
bgzf file.
Definition: InputFile.h:48
unsigned int offset
offset of this record from the previous one, in bases
Definition: glfHandler.h:48
unsigned char mapQuality
root mean squared maximum mapping quality for overlapping reads
Definition: glfHandler.h:54
unsigned depth
log10 minimum likelihood * 10 and the number of mapped reads
Definition: glfHandler.h:51
unsigned char lk[10]
log10 likelihood ratio * 10 for genotypes AA, AC, AG, AT, CC, CG, CT, GG, GT, TT
Definition: glfHandler.h:59
unsigned char refBase
"XACMGRSVTWYHKDBN"[ref_base] gives the reference base
Definition: glfHandler.h:45