Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit b139576

Browse files
More experiments with generating input data for syntax tree parsers
1 parent 2f0edae commit b139576

File tree

3 files changed

+240
-1
lines changed

3 files changed

+240
-1
lines changed

blobstamper/galley.cpp

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -347,6 +347,19 @@ GalleySetBase::extract_internal(Blob &blob)
347347
return res;
348348
}
349349

350+
void
351+
GalleySetBase::LoadAll(Blob &blob)
352+
{
353+
std::vector<Blob> blobs = extract_internal(blob);
354+
for(int i=0; i<blobs.size(); i++)
355+
{
356+
Blob blob = blobs[i];
357+
StampBase & stamp = stamps[i];
358+
stamp.Load(blob);
359+
}
360+
}
361+
362+
350363
std::vector<std::string>
351364
GalleySetStr::ExtractStrSet(Blob &blob)
352365
{
@@ -356,7 +369,7 @@ GalleySetStr::ExtractStrSet(Blob &blob)
356369
{
357370
Blob blob = blobs[i];
358371
StampBaseStr & stamp = s_stamps[i];
359-
std::string str= stamp.ExtractStr(blob);
372+
std::string str = stamp.ExtractStr(blob);
360373
res.push_back(str);
361374
}
362375
return res;

blobstamper/galley.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ class GalleySetBase : public GalleyBase
9999
public:
100100
GalleySetBase(std::vector<std::reference_wrapper<StampBase>> arg) : stamps(arg) {};
101101
std::vector<Blob> extract_internal(Blob &blob);
102+
void LoadAll(Blob &blob);
102103

103104
int minSize() override;
104105
int maxSize() override;

examples/exampleZZ.cpp

Lines changed: 225 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,225 @@
1+
#include<stdio.h>
#include<string.h>

#include<cstdlib>
#include<functional>
#include<iostream>
#include<limits>
#include<memory>
#include<string>
#include<vector>

#include<blobstamper/blobstamper.h>
9+
10+
// Shorthand for a vector of reference_wrapper<T>, i.e. a list of non-owning references.
// NOTE(review): declaring a new alias template inside namespace std is not one of the
// extensions the C++ standard allows, so this is formally undefined behaviour. It is left
// in place because the rest of this file refers to it as std::ref_vector.
namespace std
11+
{
12+
template<class T> using ref_vector = vector<reference_wrapper<T>>;
13+
}
14+
15+
template<class StampT> class StampLottery: public StampT
16+
{
17+
protected:
18+
std::ref_vector<StampT> stamps;
19+
int oracle_size;
20+
int init_oracle_size(std::ref_vector<StampT> stamps_arg);
21+
22+
int stored_min;
23+
int init_stored_min(std::ref_vector<StampT> stamps_arg);
24+
25+
public:
26+
StampLottery(std::ref_vector<StampT> stamps_arg): stamps(stamps_arg), oracle_size(init_oracle_size(stamps_arg)), stored_min(init_stored_min(stamps_arg)) {};
27+
StampLottery(): stored_min(-1) {};
28+
29+
virtual int minSize() override;
30+
virtual int maxSize() override;
31+
virtual std::string ExtractStr(Blob &blob) override;
32+
void Append(StampT & stamp);
33+
};
34+
35+
36+
template<class StampT> int
37+
StampLottery<StampT>::
38+
init_stored_min(std::ref_vector<StampT> stamps_arg)
39+
{
40+
int min = std::numeric_limits<int>::max();
41+
42+
for(StampT & stamp : stamps)
43+
{
44+
45+
if (min > stamp.minSize())
46+
min = stamp.minSize();
47+
}
48+
return min;
49+
}
50+
51+
template<class StampT> int
52+
StampLottery<StampT>::init_oracle_size(std::ref_vector<StampT> stamps_arg)
53+
{
54+
unsigned long size = stamps_arg.size();
55+
if (size < std::numeric_limits<unsigned char>::max())
56+
return 1;
57+
if (size < std::numeric_limits<unsigned short int>::max())
58+
return 2;
59+
if (size < std::numeric_limits<unsigned int>::max())
60+
return 4;
61+
return 8;
62+
}
63+
64+
65+
template<class StampT> int
66+
StampLottery<StampT>::minSize()
67+
{
68+
return stored_min + oracle_size;
69+
}
70+
71+
template<class StampT> int
72+
StampLottery<StampT>::maxSize()
73+
{
74+
return -1; // FIXME this is true only for recurion case. Should fix it somehow if Lottery is used in other cases
75+
}
76+
77+
78+
template<class StampT> std::string
79+
StampLottery<StampT>::ExtractStr(Blob &blob)
80+
{
81+
unsigned long oracle;
82+
unsigned long oracle_max;
83+
84+
switch (oracle_size)
85+
{
86+
case 1:
87+
{
88+
StampArithm<unsigned char> stamp;
89+
oracle = stamp.ExtractValue(blob);
90+
oracle_max = std::numeric_limits<unsigned char>::max();
91+
break;
92+
}
93+
case 2:
94+
{
95+
StampArithm<unsigned short> stamp;
96+
oracle = stamp.ExtractValue(blob);
97+
oracle_max = std::numeric_limits<unsigned short>::max();
98+
break;
99+
}
100+
case 4:
101+
{
102+
StampArithm<unsigned int> stamp;
103+
oracle = stamp.ExtractValue(blob);
104+
oracle_max = std::numeric_limits<unsigned int>::max();
105+
break;
106+
}
107+
case 8:
108+
{
109+
StampArithm<unsigned long> stamp;
110+
oracle = stamp.ExtractValue(blob);
111+
oracle_max = std::numeric_limits<unsigned long>::max();
112+
break;
113+
}
114+
default:
115+
abort(); // Should never get here
116+
}
117+
118+
/* Actually we use only stamps that short enogh to consume blob's available data*/
119+
std::ref_vector<StampT> actual_stamps;
120+
for(StampT & stamp : stamps)
121+
{
122+
if(blob.Size() < stamp.minSize()) // Skip all stamps that dose not fit
123+
continue;
124+
if ( stamp.isUnbounded() || // Unbounded is always ok
125+
stamp.maxSize() > blob.Size() || // Variated that can consume all data is ok
126+
stamp.minSize() * 2 > blob.Size() // Fixed or variated stamp that lefts less data then it's min size will also do
127+
)
128+
{
129+
actual_stamps.push_back(stamp);
130+
}
131+
}
132+
if (actual_stamps.empty())
133+
{
134+
// Add just everything that fits
135+
for(StampT & stamp : stamps)
136+
{
137+
if(blob.Size() < stamp.minSize()) // Skip all stamps that dose not fit
138+
continue;
139+
actual_stamps.push_back(stamp);
140+
}
141+
}
142+
143+
if (actual_stamps.empty())
144+
throw OutOfData(); // This should not happen
145+
146+
long long index = ((double) oracle) / oracle_max * actual_stamps.size();
147+
if ( index == actual_stamps.size()) index--; /* If we hit the boundary step inside a bit*/
148+
149+
StampT& stamp = actual_stamps[index];
150+
return stamp.ExtractStr(blob);
151+
}
152+
153+
154+
template<class StampT> void
155+
StampLottery<StampT>::Append(StampT & stamp)
156+
{
157+
if (stamp.minSize()<stored_min)
158+
{
159+
stored_min = stamp.minSize();
160+
}
161+
stamps.push_back(stamp);
162+
oracle_size = init_oracle_size(stamps);
163+
}
164+
165+
166+
class BinaryOp: public StampBaseStr, public GalleySetBase
167+
{
168+
protected:
169+
std::string op_name;
170+
StampBaseStr &stamp1;
171+
StampBaseStr &stamp2;
172+
public:
173+
virtual std::string ExtractStr(Blob &blob) override;
174+
BinaryOp(std::string arg_op_name, StampBaseStr& arg_stamp1, StampBaseStr& arg_stamp2) :
175+
GalleySetBase({arg_stamp1, arg_stamp2}),
176+
op_name(arg_op_name),
177+
stamp1(arg_stamp1),
178+
stamp2(arg_stamp2) {};
179+
};
180+
181+
std::string
182+
BinaryOp::ExtractStr(Blob &blob)
183+
{
184+
std::vector<Blob> blobs = extract_internal(blob);
185+
return (std::string)"(" + stamp1.ExtractStr(blobs[0]) + " "+ op_name + " " + stamp2.ExtractStr(blobs[1]) + ")";
186+
}
187+
188+
// Operator names; main() creates one BinaryOp stamp for each of them.
std::vector<std::string> ops = {"+","-","*","/","^"};
189+
190+
int main()
191+
{
192+
// char data[] = "abcdef" "abcdef" "ABCDEF" "012345" "sdfaskdlfjalsfjdlasjfaksdjfgkwuergkwhfdaksjdfgaskuyrgfaelkrgfsaldjfgakyefgrkweugyfaksjskdfsd";
193+
194+
char data[] =
195+
"\x051\x04E\x05A\x018\x043\x00C\x039\x0DC\x069\x0AC\x009\x014\x05A\x0B2\x07F\x078\x021\x09F\x08B\x0B1\x07E\x060\x01F\x04A\x0D1\x071\x05C\x04F\x011\x0D0\x061\x0FB\x037\x077\x081\x00C\x059\x00A\x037\x02F\x061\x04A\x065\x06D"
196+
"\x003\x04A\x0BC\x099\x0F8\x00B\x0F7\x020\x0C9\x074\x065\x008\x0B4\x010\x008\x0B4\x08B\x070\x0E1\x0EF\x026\x04F\x0F9\x0AB\x01C\x06C\x035\x018\x086\x037\x0E7\x02F\x044\x057\x001\x020\x006\x0DD\x0C4\x059\x0D1\x0C5\x0A9\x005"
197+
"\x038\x078\x0E2\x053\x01D\x0F0\x06E\x0E6\x018\x0B6\x048\x0F1\x0DC\x061\x092\x0FB\x0D3\x010\x0B8\x042\x0CA\x0C1\x0E3\x075\x077\x099\x093\x0CC\x063\x0F0\x09E\x044\x03D\x070\x01A\x089\x035\x032\x04A\x0BD\x082\x0BF\x0EA\x002"
198+
"\x043\x071\x079\x0A0\x068\x0B3\x0D9\x029\x0E9\x045\x0A2\x027\x003\x02E\x0E2\x01F\x007\x0BD\x0CF\x00A\x03E\x00D\x044\x024\x0FA\x0DB\x03D\x033\x036\x011\x081\x070\x0B6\x04A\x083\x061\x05F\x0AE\x0F0\x0C5\x0A1\x010\x05B\x003"
199+
"\x061\x0C3\x0D2\x078\x0BD\x0F8\x0E1\x04B\x02F\x0D9\x093\x09F\x00E\x0D6\x03A\x070\x0F8\x052\x013\x0EE\x062\x0C0\x027\x0E5\x07B\x07B\x09E\x05D\x074\x068\x0C6\x0CD\x04E\x022\x03B\x04E\x0E7\x0E7\x0EE\x0EC\x015\x02C\x0FA\x050"
200+
"\x033\x042\x0E6\x0BF\x028\x002\x052\x096\x033\x057\x0D8\x082\x053\x06E\x0BD\x0C6\x0ED\x015\x036\x09E\x03B\x0BE\x0F3\x068\x0BD\x0EC\x0D3\x0E9\x023\x029\x081\x0CF\x0F8\x02D\x081\x049\x007\x0CC\x005\x004\x062\x040\x0E0\x0D0"
201+
"\x0CD\x062\x0D4\x09B\x007\x001\x037\x020\x059\x0AC\x0FC\x0A4\x095\x049\x05F\x04C\x0DA\x02B\x0E8\x0E9\x0BF\x029\x01F\x0D0\x06B\x06E\x0F5\x005\x075\x07B\x036\x0D2\x054\x078\x0D3\x059\x077\x09A\x0D5\x079\x0AC\x034\x030\x0FD"
202+
"\x006\x079\x022\x0F4\x0ED\x059\x080\x081\x08F\x0A6\x08F\x042\x08A\x0CC\x030\x019\x094\x0F3\x062\x00B\x08A\x0D4\x0F8\x0F3\x03B\x049\x0D1\x06D\x0C6\x067\x006\x0D3\x023\x035\x053\x0C1\x0F8\x068\x0EF\x0AD\x0C7\x053\x004\x02C"
203+
"\x092\x087\x075\x0B0\x0F0\x0F7\x0D9\x04C\x0C7\x0A2\x095\x02B\x038\x02E\x0F2\x005\x0BE\x0CD\x02E\x093\x08A\x088\x063\x07D\x0F1\x08A\x002\x0D0\x0B9\x05C\x008\x066\x002\x044\x0B0\x08F\x041\x009\x06F\x0E5\x08B\x068\x0EB\x05A";
204+
205+
Blob blob(data, strlen(data));
206+
207+
StampArithm<unsigned char> stampс;
208+
209+
DictLCAlphaSmall dict;
210+
StampDict stamp_dict(dict);
211+
212+
StampLottery<StampBaseStr> stamp_lot({stampс, stamp_dict});
213+
214+
for(std::string op_name : ops)
215+
{
216+
BinaryOp *stamp_bi = new BinaryOp(op_name, stamp_lot, stamp_lot);
217+
stamp_lot.Append(*stamp_bi);
218+
}
219+
220+
for(int i=stamp_lot.minSize(); i<=strlen(data);i++)
221+
{
222+
Blob blob2(data, i);
223+
std::cout << i << " " << stamp_lot.ExtractStr(blob2) <<"\n";
224+
}
225+
}

0 commit comments

Comments
 (0)