-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathciff_lin.h
370 lines (332 loc) · 9.53 KB
/
ciff_lin.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
/*
CIFF_LIN.H
----------
Copyright (c) 2019 Andrew Trotman
Released under the 2-clause BSD license (See:https://en.wikipedia.org/wiki/BSD_licenses)
*/
/*!
@file
@author Andrew Trotman
@copyright 2019 Andrew Trotman
@brief Reader for Jimmy Lin's shared index format.
@details Jimmy uses Anserini to index and then exports using Google protocol buffers. The
protocol buffer format is specified by:
@code{.unparsed}
syntax = "proto3";
package io.anserini.cidxf;
message Posting {
int32 docid = 1;
int32 tf = 2;
}
message PostingsList {
string term = 1;
int64 df = 2;
int64 cf = 3;
repeated Posting posting = 4;
}
@endcode
Where each PostingsList is written using writeDelimitedTo() and so each postings list is prefixed by a length integer.
This code provides an iterator over a file of this format (once read into memory)
For details of the encoding see: https://developers.google.com/protocol-buffers/docs/encoding
*/
#pragma once
#include <stdint.h>
#include <vector>
#include "protobuf.h"
namespace JASS
{
/*
CLASS CIFF_LIN
--------------
*/
/*!
@brief Reader for Jimmy Lin's shared index format.
@details Jimmy uses Anserini to index and then exports using Google protocol buffers. The
protocol buffer format is specified by:
@code{.unparsed}
syntax = "proto3";
package io.anserini.cidxf;
message Posting {
int32 docid = 1;
int32 tf = 2;
}
message PostingsList {
string term = 1;
int64 df = 2;
int64 cf = 3;
repeated Posting posting = 4;
}
@endcode
Where each PostingsList is written using writeDelimitedTo() and so each postings list is prefixed by a length integer.
This code provides an iterator over a file of this format (once read into memory)
For details of the encoding see: https://developers.google.com/protocol-buffers/docs/encoding
*/
class ciff_lin
{
public:
/*!
@enum error_code
@brief success or failure.
*/
enum error_code
{
OK = 0, ///< Method completed successfully
FAIL = 1 ///< Method did not completed successfully
} ;
/*
CLASS CIFF_LIN::POSTINGS_LIST
-----------------------------
*/
/*!
@brief A postings list with a term, df, cf, and postings list of <d,tf> pairs.
*/
class postings_list
{
public:
/*
CLASS CIFF_LIN::POSTING
-----------------------
*/
/*!
@brief A <docid,tf> tuple.
*/
class posting
{
public:
uint32_t docid; ///< The Document identifier d1-gap encoded.
uint32_t term_frequency; ///< The number of times the term occurs in document with id docid.
public:
posting() :
docid(0),
term_frequency(0)
{
/*
Nothing
*/
}
} ;
public:
protobuf::slice term; ///< The term as a <pointer,length> tuple.
uint64_t document_frequency; ///< The number of documents containing the term.
uint64_t collection_frequency; ///< The number of times the term occurs in the collection.
std::vector<posting> postings; ///< The postings list, a vector of <d,tf> tuples
private:
/*
CIFF_LIN::POSTINGS_LIST::GET_NEXT_POSTINGS_PAIR()
-------------------------------------------------
*/
/*!
@brief read a <d,tf> pair from a protobuf encoded stream
@param into [out] The posting pair being read
@param stream [in, out] A reference to a pointer to the start of the value, points to the next byte on return.
@param stream_end [in] A pointer to the byte after the last byte in the stream.
@return OK on success, FAIL on error (unknown field number)
*/
static error_code get_next_postings_pair(postings_list::posting &into, const uint8_t *&stream, const uint8_t *stream_end)
{
while (stream < stream_end)
{
protobuf::wire_type type;
uint8_t field = protobuf::get_type_and_field(type, stream);
if (type == protobuf::VARINT)
{
uint64_t value = protobuf::get_uint64_t(stream);
if (field == 1)
into.docid = static_cast<uint32_t>(value);
else if (field == 2)
into.term_frequency = static_cast<uint32_t>(value);
else
return FAIL;
}
else
return FAIL;
}
return OK;
}
public:
/*
CIFF_LIN::POSTINGS_LIST::GET_NEXT_POSTINGS()
--------------------------------------------
*/
/*!
@brief read a full postings list (inclding term, df, cf, etc.) from a protobuf encoded stream
@param into [out] The postings list being read
@param stream [in, out] A reference to a pointer to the start of the value, points to the next byte on return.
@param stream_end [in] A pointer to the byte after the last byte in the stream.
@return OK on success, FAIL on error (unknown field number)
*/
static error_code get_next_postings(postings_list &into, const uint8_t *&stream, const uint8_t *stream_end)
{
while (stream < stream_end)
{
protobuf::wire_type type;
uint8_t field = protobuf::get_type_and_field(type, stream);
if (type == protobuf::VARINT)
{
uint64_t value = protobuf::get_uint64_t(stream);
if (field == 2)
into.document_frequency = value;
else if (field == 3)
into.collection_frequency = value;
else
return FAIL;
}
else if (type == protobuf::BLOB)
{
protobuf::slice term = protobuf::get_blob(stream);
if (field == 1)
into.term = term;
else if (field == 4)
{
const uint8_t *here = term.start;
postings_list::posting d_tf_pair;
if (get_next_postings_pair(d_tf_pair, here, here + term.length) == FAIL)
return FAIL;
into.postings.push_back(d_tf_pair);
}
}
else
return FAIL;
}
return OK;
}
/*
CIFF_LIN::POSTINGS_LIST::CLEAR()
--------------------------------
*/
/*!
@brief removes all content from the postings list
*/
void clear(void)
{
term.clear();
document_frequency = 0;
collection_frequency = 0;
postings.clear();
}
} ;
private:
/*
CLASS CIFF_LIN::ITERATOR
------------------------
*/
/*!
@brief iterator class for iterating over an index
*/
class iterator
{
private:
ciff_lin &source; ///< The ciff_lin being iterated over
const uint8_t *stream; ///< Where in the source stream we are reading from
size_t where; ///< Are we the start or end of the data
postings_list postings; ///< The postings list we just made
public:
/*
CIFF_LIN::ITERATOR::ITERATOR()
------------------------------
*/
/*!
@brief Constructor
@param of [in] The ciff_lin object we iterate over
@where [in] Either 0 for start() or file_size for end()
*/
iterator(ciff_lin &of, size_t where) :
source(of),
stream(source.source_file),
where(where)
{
/* Nothing */
}
/*
CIFF_LIN::ITERATOR::OPERATOR++()
--------------------------------
*/
/*!
@brief Move on to the next postings list by constructing and storing the current one. On error move to the end of the stream and mark it as bad
@return A reference to this iterator
*/
const iterator &operator++()
{
int64_t postings_list_length = protobuf::get_uint64_t(stream);
postings.clear();
if (postings_list::get_next_postings(postings, stream, stream + postings_list_length) == FAIL)
{
/*
On error move to the end of the stream and mark the stream as bad
*/
stream = source.source_file + source.source_file_length;
source.status = FAIL;
}
return *this;
}
/*
CIFF_LIN::ITERATOR::OPERATOR*()
-------------------------------
*/
/*!
@brief Return the most recently constructed postings list
@return A reference to the current postings list
*/
const postings_list &operator*() const
{
return postings;
}
/*
CIFF_LIN::ITERATOR::OPERATOR!=()
--------------------------------
*/
/*!
@brief Compare two iterators.
@return True if the iterators are different
*/
bool operator!=(const iterator &with) const
{
return stream + where != with.stream + with.where;
}
};
private:
const uint8_t *source_file; ///< The CIFF file in memory
size_t source_file_length; ///< The length of the CIFF file in bytes
public:
error_code status; ///< OK or FAIL (FAIL only on error in input stream)
public:
/*
CIFF_LIN::CIFF_LIN()
--------------------
*/
/*!
@brief Constructor
@param source_file [in] a Pointer to the protobuf file once already read into memory.
@param source_file_length [in] The length (in bytes) of the source file once in memory.
*/
ciff_lin(const uint8_t *source_file, size_t source_file_length) :
source_file(source_file),
source_file_length(source_file_length),
status(OK)
{
/* Nothing */
}
/*
CIFF_LIN::BEGIN()
-----------------
*/
/*!
@brief Return and iterator to the start of this object
*/
const iterator begin()
{
return ++iterator(*this, 0);
}
/*
CIFF_LIN::END()
---------------
*/
/*!
@brief Return and iterator to the end of this object
*/
const iterator end()
{
return iterator(*this, source_file_length);
}
} ;
}