-
Notifications
You must be signed in to change notification settings - Fork 634
/
Copy pathGBTokenizer.m
503 lines (427 loc) · 19.5 KB
/
GBTokenizer.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
//
// GBTokenizer.m
// appledoc
//
// Created by Tomaz Kragelj on 25.7.10.
// Copyright (C) 2010, Gentle Bytes. All rights reserved.
//
#import <RegexKitLite/RegexKitLite.h>
#import "PKToken+GBToken.h"
#import "GBApplicationSettingsProvider.h"
#import "GBSourceInfo.h"
#import "GBComment.h"
#import "GBTokenizer.h"
// Private class extension: helper methods and internal state used only by the implementation in this file.
@interface GBTokenizer ()
// Advances past comment tokens at the current position, accumulating their text in the comment builders.
- (BOOL)consumeComments;
// Cleans raw comment text (delimiter lines, leading `*` prefixes, surrounding spaces); returns nil when nothing remains.
- (NSString *)commentValueFromString:(NSString *)value isMultiline:(BOOL)multiline;
// Rewrites HeaderDoc directives within a single line when settings.preprocessHeaderDoc is enabled.
- (NSString *)lineByPreprocessingHeaderDocDirectives:(NSString *)line;
// Moves @param/@result/@return blocks behind all other lines when settings.preprocessHeaderDoc is enabled.
- (NSArray *)linesByReorderingHeaderDocDirectives:(NSArray *)lines;
// Drains the given tokenizer into an array, dropping comment tokens that are not appledoc comments.
- (NSArray *)allTokensFromTokenizer:(PKTokenizer *)tokenizer;
// Returns the offset (within `input`) of the first character of the line containing the given offset.
- (NSUInteger)offsetOfLineContainingOffset:(NSUInteger)offset;
// Returns the length of the whitespace-only run between line start and offset, or -1 if anything else precedes the offset.
- (NSInteger)indentationAtOffset:(NSUInteger)offset;
// Name of the file being tokenized; used when building source info objects.
@property (strong) NSString *filename;
// Full source string, taken from the source tokenizer at initialization.
@property (strong) NSString *input;
// All tokens gathered from the source tokenizer (see allTokensFromTokenizer:).
@property (strong) NSArray *tokens;
// Index into `tokens` of the current token.
@property (assign) NSUInteger tokenIndex;
// Whether the last / previous collected comment came from a multi line comment token.
@property (assign) BOOL isLastCommentMultiline;
@property (assign) BOOL isPreviousCommentMultiline;
// Raw text of the most recently collected comment and of the one collected before it.
@property (strong) NSMutableString *lastCommentBuilder;
@property (strong) NSMutableString *previousCommentBuilder;
// Tokens that started the last / previous comment; used to derive their source info.
@property (strong) PKToken *lastCommentToken;
@property (strong) PKToken *previousCommentToken;
// Regular expressions used to recognize comment flavours and delimiter lines (assigned in the initializer).
@property (strong) NSString *singleLineCommentAfterRegex;
@property (strong) NSString *singleLineCommentRegex;
@property (strong) NSString *multiLineCommentRegex;
@property (strong) NSString *commentDelimiterRegex;
// Application settings; may be nil (the settings-less factory passes nil).
@property (strong) GBApplicationSettingsProvider *settings;
@end
#pragma mark -
@implementation GBTokenizer
#pragma mark Initialization & disposal
// Convenience factory: creates a tokenizer for the given source tokenizer and filename, passing nil for settings.
+ (id)tokenizerWithSource:(PKTokenizer *)tokenizer filename:(NSString *)filename {
return [self tokenizerWithSource:tokenizer filename:filename settings:nil];
}
// Factory: allocates a tokenizer and initializes it via initWithSourceTokenizer:filename:settings:.
+ (id)tokenizerWithSource:(PKTokenizer *)tokenizer filename:(NSString *)filename settings:(id)settings {
return [[self alloc] initWithSourceTokenizer:tokenizer filename:filename settings:settings];
}
// Initializes the tokenizer from a source tokenizer.
//
// tokenizer   - source of tokens and of the raw input string; must not be nil.
// aFilename   - name of the file being parsed; must be a non-empty string.
// theSettings - application settings, may be nil.
//
// All tokens are read eagerly and any comments at the very start of the token stream are consumed, so the
// receiver is positioned on the first non-comment token once this method returns.
- (id)initWithSourceTokenizer:(PKTokenizer *)tokenizer filename:(NSString *)aFilename settings:(id)theSettings {
    NSParameterAssert(tokenizer != nil);
    NSParameterAssert(aFilename != nil);
    NSParameterAssert([aFilename length] > 0);
    GBLogDebug(@"Initializing tokenizer...");

    self = [super init];
    if (!self) return self;

    // Source description.
    self.filename = aFilename;
    self.input = tokenizer.string;
    self.settings = theSettings;

    // Patterns used for recognizing the different comment flavours and delimiter lines.
    self.singleLineCommentRegex = @"(?m-s:\\s*///([^<].*)$)";
    self.singleLineCommentAfterRegex = @"(?m-s:\\s*///<(.*)$)";
    self.multiLineCommentRegex = @"(?s:/\\*[*!](.*)\\*/)";
    self.commentDelimiterRegex = @"^[!@#$%^&*()_=+`~,<.>/?;:'\"-]{3,}$";

    // Comment accumulation state.
    self.previousCommentBuilder = [NSMutableString string];
    self.lastCommentBuilder = [NSMutableString string];

    // Token stream; comments in front of the first real token are consumed right away.
    self.tokenIndex = 0;
    self.tokens = [self allTokensFromTokenizer:tokenizer];
    [self consumeComments];
    return self;
}
#pragma mark Tokenizing handling
// Returns the non-comment token that lies `offset` non-comment tokens ahead of the current position
// (offset 0 is the first non-comment token at or after the current index). Comment tokens are skipped
// and not counted. Returns the EOF token when the stream ends before such a token is found.
- (PKToken *)lookahead:(NSUInteger)offset {
    NSUInteger total = [self.tokens count];
    NSUInteger seen = 0;
    for (NSUInteger index = self.tokenIndex; index < total; index++) {
        PKToken *candidate = self.tokens[index];
        if ([candidate isComment]) continue;
        if (seen == offset) return candidate;
        seen++;
    }
    return [PKToken EOFToken];
}
// Enumerates non-comment tokens from the current position without consuming them.
//
// end   - enumeration stops (without reporting) at the first token matching this string.
// block - invoked for each reported token; set *stop to YES to end enumeration early. May be nil.
//
// Comment tokens are skipped. Note: the loop's own increment already advances past a skipped comment;
// incrementing the index a second time here would also skip the token following each comment.
- (void)lookaheadTo:(NSString *)end usingBlock:(void (^)(PKToken *token, BOOL *stop))block {
    NSUInteger tokenCount = [self.tokens count];
    BOOL quit = NO;
    for (NSUInteger index = self.tokenIndex; index < tokenCount; ++index) {
        PKToken *token = self.tokens[index];
        if ([token isComment]) continue;
        if ([token matches:end]) break;
        // Guard against a nil block, as consumeFrom:to:usingBlock: does.
        if (block) block(token, &quit);
        if (quit) break;
    }
}
// Returns the token at the current index, or the EOF token once the index has run past the last token.
- (PKToken *)currentToken {
    return [self eof] ? [PKToken EOFToken] : self.tokens[self.tokenIndex];
}
// Collects postfix comments (text matched by singleLineCommentAfterRegex, i.e. `///<` comments) by walking
// backwards from the current token index until `startToken` has been visited or index 0 is reached
// (the token at index 0 itself is not inspected).
//
// Matches found on several tokens are joined with newlines in source order and the result is then treated
// as a multi line comment. Returns nil when no postfix comment text is found; otherwise returns a comment
// whose source info is derived from `startToken`.
- (GBComment *)postfixCommentFrom:(PKToken *)startToken {
    NSString *collected = nil;
    BOOL spansSeveralTokens = NO;

    if (self.tokenIndex > 0) {
        PKToken *candidate = nil;
        NSUInteger cursor = self.tokenIndex;
        do {
            // The current index may point past the end (EOF); just step back in that case.
            if (cursor < self.tokens.count) {
                candidate = self.tokens[cursor];
                NSArray *matches = [[candidate stringValue] componentsMatchedByRegex:self.singleLineCommentAfterRegex capture:1];
                if ([matches count] > 0) {
                    NSString *joined = [matches componentsJoinedByString:@""];
                    if (collected) {
                        // We're walking backwards, so earlier text goes in front.
                        collected = [NSString stringWithFormat:@"%@\n%@", joined, collected];
                        spansSeveralTokens = YES;
                    } else {
                        collected = joined;
                    }
                }
            }
            cursor--;
        } while (cursor > 0 && candidate != startToken);
    }

    if (collected == nil) return nil;
    NSString *cleaned = [self commentValueFromString:collected isMultiline:spansSeveralTokens];
    GBSourceInfo *info = [self sourceInfoForToken:startToken];
    return [GBComment commentWithStringValue:cleaned sourceInfo:info];
}
// Advances over `count` tokens, or fewer if EOF is reached first. After each step any comments that follow
// are consumed as well, so the current token is never left on a comment.
- (void)consume:(NSUInteger)count {
    for (NSUInteger remaining = count; remaining > 0; remaining--) {
        if ([self eof]) break;
        self.tokenIndex++;
        [self consumeComments];
    }
}
// Convenience form of consumeFrom:to:usingBlock: without a start token: reports tokens from the current
// position up to the first token matching `end`.
- (void)consumeTo:(NSString *)end usingBlock:(void (^)(PKToken *token, BOOL *consume, BOOL *stop))block {
[self consumeFrom:nil to:end usingBlock:block];
}
// Reports and consumes tokens between a start and an end token.
//
// start - optional opening token. When given, the current token must match it (otherwise nothing happens)
//         and it is consumed before reporting begins. Nested start/end pairs are balanced.
// end   - closing token. It is not reported, and is consumed if the current token matches it on exit.
// block - optional; invoked for every reported token. Setting *consume to NO keeps the tokenizer on the
//         same token (the block is then expected to advance the tokenizer itself or set *stop),
//         setting *stop to YES ends reporting.
- (void)consumeFrom:(NSString *)start to:(NSString *)end usingBlock:(void (^)(PKToken *token, BOOL *consume, BOOL *stop))block {
    // The opening token, when requested, must be the current one.
    if (start) {
        if (![[self currentToken] matches:start]) return;
        [self consume:1];
    }

    NSUInteger depth = 1;
    BOOL shouldStop = NO;
    while (![self eof]) {
        PKToken *token = [self currentToken];

        // Track nesting so that an inner start/end pair doesn't terminate reporting prematurely.
        if (start && [token matches:start]) depth++;
        if ([token matches:end]) {
            if (!start) break;
            depth--;
            if (depth == 0) break;
        }

        BOOL shouldConsume = YES;
        if (block) block(token, &shouldConsume, &shouldStop);
        if (shouldConsume) [self consume:1];
        if (shouldStop) break;
    }

    // Step over the closing token when that's where we ended up.
    if ([[self currentToken] matches:end]) [self consume:1];
}
// YES once the current index has moved past the last available token.
- (BOOL)eof {
    NSUInteger available = [self.tokens count];
    return (self.tokenIndex >= available);
}
#pragma mark Token information handling
// Returns source info (filename and line number) for the current token; see sourceInfoForToken:.
- (GBSourceInfo *)sourceInfoForCurrentToken {
return [self sourceInfoForToken:[self currentToken]];
}
// Builds source info for the given token (must not be nil): the receiver's filename plus a line number
// computed from the part of the input that precedes the token's offset.
- (GBSourceInfo *)sourceInfoForToken:(PKToken *)token {
    NSParameterAssert(token != nil);
    NSRange precedingRange = NSMakeRange(0, [token offset]);
    NSUInteger lineNumber = [self.input numberOfLinesInRange:precedingRange];
    return [GBSourceInfo infoWithFilename:self.filename lineNumber:lineNumber];
}
#pragma mark Comments handling
// Consumes all comment tokens starting at the current token, leaving the current index on the first
// non-comment token (or at EOF). While doing so, the raw comment text is gathered into lastCommentBuilder;
// whenever a new comment starts, what used to be the "last" comment becomes the "previous" one.
//
// Returns YES if at least one comment token was consumed, NO if the current token isn't a comment (or we're
// at EOF) and the index therefore didn't move.
- (BOOL)consumeComments {
    if ([self eof]) return NO;
    BOOL didConsume = NO;
    NSUInteger previousSingleLineEndOffset = NSNotFound;
    NSInteger previousSingleLineIndentation = -1;
    while (![self eof] && [[self currentToken] isComment]) {
        PKToken *token = [self currentToken];
        NSString *value = nil;

        // Match single line comments. We assume there's only one single line comment per token; if the regex
        // finds more (should never happen), they're simply combined. Then we check whether the comment
        // continues the previous single liner by testing string offset and indentation. If so, the values
        // are grouped together, otherwise a new single line comment is started. Finally we remember the end
        // offset and indentation to allow grouping of the next single line comment.
        NSArray *singleLiners = [[token stringValue] componentsMatchedByRegex:self.singleLineCommentRegex capture:1];
        if ([singleLiners count] > 0) {
            value = [singleLiners componentsJoinedByString:@""];
            NSInteger tokenIndentation = [self indentationAtOffset:[token offset]];
            BOOL isContinuingPreviousSingleLiner = ([token offset] == previousSingleLineEndOffset + 1);
            if (!isContinuingPreviousSingleLiner && previousSingleLineIndentation > 0 && tokenIndentation == previousSingleLineIndentation) {
                isContinuingPreviousSingleLiner = ([token offset] == previousSingleLineEndOffset + previousSingleLineIndentation + 1);
            }
            if (isContinuingPreviousSingleLiner) {
                [self.lastCommentBuilder appendString:@"\n"];
            } else {
                [self startNewCommentWithToken:token isMultiline:NO];
            }
            previousSingleLineEndOffset = [token offset] + [[token stringValue] length];
            previousSingleLineIndentation = tokenIndentation;
        }
        // Match multi line comments and only process the last match (in reality there should only be one
        // comment in each multiline comment token, but let's handle any strange cases gracefully).
        else {
            NSArray *multiLiners = [[token stringValue] componentsMatchedByRegex:self.multiLineCommentRegex capture:1];
            value = [multiLiners lastObject];
            [self startNewCommentWithToken:token isMultiline:YES];
        }

        // Append string value to current comment and proceed with next token.
        if (value) [self.lastCommentBuilder appendString:value];
        self.tokenIndex++;
        didConsume = YES;
    }

    // If the last comment matches the method group regex from settings (an @name section), it is assigned to
    // the previous comment and the current one is reset. This should ideally be handled by a higher level
    // component, but it's simplest to do it here.
    if (self.settings && [self.lastCommentBuilder isMatchedByRegex:self.settings.commentComponents.methodGroupRegex]) {
        self.previousCommentBuilder = [self.lastCommentBuilder mutableCopy];
        [self.lastCommentBuilder setString:@""];
        self.previousCommentToken = self.lastCommentToken;
        self.lastCommentToken = nil;
    }
    return didConsume;
}

// Private helper for consumeComments: demotes the current "last" comment (text, multiline flag and token)
// to "previous" and starts an empty "last" comment anchored at the given token.
- (void)startNewCommentWithToken:(PKToken *)token isMultiline:(BOOL)multiline {
    [self.previousCommentBuilder setString:self.lastCommentBuilder];
    [self.lastCommentBuilder setString:@""];
    self.isPreviousCommentMultiline = self.isLastCommentMultiline;
    self.previousCommentToken = self.lastCommentToken;
    self.isLastCommentMultiline = multiline;
    self.lastCommentToken = token;
}
// Converts raw comment text into the final comment string.
//
// value     - raw text as collected from comment tokens; nil or empty yields nil.
// multiline - YES when the text comes from a multi line comment. In that case the first line is exempt
//             from the "every line starts with *" test, as it directly follows the opening marker.
//
// Steps: drop delimiter lines (3+ delimiter characters), detect and strip a common leading `*` prefix,
// reorder and preprocess HeaderDoc directives, and trim plain space padding from lines that are not
// indented by two or more spaces or by a tab. Returns nil if the resulting string is empty.
- (NSString *)commentValueFromString:(NSString *)value isMultiline:(BOOL)multiline {
    if ([value length] == 0) return nil;
    NSCharacterSet *whitespace = [NSCharacterSet whitespaceCharacterSet];
    NSArray *lines = [value componentsSeparatedByCharactersInSet:[NSCharacterSet newlineCharacterSet]];
    NSMutableArray *strippedLines = [NSMutableArray arrayWithCapacity:[lines count]];

    // Pass 1: remove delimiters. A line that shrinks when the delimiter pattern is removed is replaced by
    // whatever remains, or dropped altogether if nothing remains. Other lines are kept untouched.
    for (NSString *line in lines) {
        NSString *trimmed = [line stringByTrimmingCharactersInSet:whitespace];
        NSString *withoutDelimiter = [trimmed stringByReplacingOccurrencesOfRegex:self.commentDelimiterRegex withString:@""];
        BOOL hadDelimiter = ([trimmed length] > [withoutDelimiter length]);
        if (!hadDelimiter) {
            [strippedLines addObject:line];
        } else if ([withoutDelimiter length] > 0) {
            [strippedLines addObject:withoutDelimiter];
        }
    }

    // Pass 2: the `*` prefix is only stripped when all relevant lines carry it. An empty last line is
    // ignored (it would only have contained the closing marker), and so is the first line of multi line comments.
    NSString *prefixRegex = @"(?m:^\\s*\\*[ ]*)";
    BOOL stripPrefix = ([strippedLines count] > 1);
    if (stripPrefix) {
        NSUInteger lastIndex = [strippedLines count] - 1;
        for (NSUInteger idx = 0; idx <= lastIndex; idx++) {
            NSString *trimmed = [strippedLines[idx] stringByTrimmingCharactersInSet:whitespace];
            if (idx == lastIndex && [trimmed length] == 0) continue;
            BOOL mustCarryPrefix = (!multiline || idx > 0);
            if (mustCarryPrefix && ![trimmed isMatchedByRegex:prefixRegex]) {
                stripPrefix = NO;
                break;
            }
        }
    }

    // Pass 3: reorder header doc directives.
    NSArray *preprocessedLines = [self linesByReorderingHeaderDocDirectives:strippedLines];

    // Pass 4: remove the common prefix and plain space padding (lines starting with two or more spaces or
    // with a tab are left alone so that prefixed example blocks survive), then join everything with newlines.
    NSCharacterSet *spacesSet = [NSCharacterSet characterSetWithCharactersInString:@" "];
    NSString *spacesPrefixRegex = @"^ {2,}";
    NSString *tabPrefixRegex = @"^\t";
    NSMutableString *result = [NSMutableString stringWithCapacity:[value length]];
    NSUInteger position = 0;
    for (NSString *sourceLine in preprocessedLines) {
        NSString *line = sourceLine;
        if (stripPrefix) line = [line stringByReplacingOccurrencesOfRegex:prefixRegex withString:@""];
        BOOL isIndented = [line isMatchedByRegex:spacesPrefixRegex] || [line isMatchedByRegex:tabPrefixRegex];
        if (!isIndented) line = [line stringByTrimmingCharactersInSet:spacesSet];
        line = [self lineByPreprocessingHeaderDocDirectives:line];
        [result appendString:line];
        if (position < [strippedLines count] - 1) [result appendString:@"\n"];
        position++;
    }

    return ([result length] == 0) ? nil : result;
}
// Rewrites HeaderDoc directives found in a single comment line. The line is returned unchanged unless
// settings.preprocessHeaderDoc is enabled. Two substitutions are currently active:
//  - `@methodgroup` and `@group` are replaced with `@name`,
//  - a line starting with `@updated` is replaced with a newline.
- (NSString *)lineByPreprocessingHeaderDocDirectives:(NSString *)line {
    if (self.settings.preprocessHeaderDoc) {
        // Disabled: remove the entire line when it contains @method or property or class.
        //line = [line stringByReplacingOccurrencesOfRegex:@"(?m:@(protocol|method|property|class).*$)" withString:@""];
        // Disabled: remove unsupported headerDoc words.
        //line = [line stringByReplacingOccurrencesOfRegex:@"(?m:^\\s*@(discussion|abstract))\\s?" withString:@"\n"];

        // Method groups are expressed with @name.
        NSString *renamed = [line stringByReplacingOccurrencesOfRegex:@"(?:@(methodgroup|group))" withString:@"@name"];

        // Drop unsupported directives.
        NSString *cleaned = [renamed stringByReplacingOccurrencesOfRegex:@"(?m:^\\s*@updated).*$?" withString:@"\n"];

        // Disabled Doxygen clean-up, intended to ease migrating large amounts of doxygen comments, e.g. turning
        //   @brief Brief Comment
        //   @details Detailed Comment.
        // into
        //   Brief Comment
        //   Detailed Comment.
        // Disabled: removes any occurrence of @brief and its surrounding whitespace.
        //line = [line stringByReplacingOccurrencesOfRegex:@"\\s*@brief\\s*" withString:@""];
        // Disabled: replaces any occurrence of @details and its surrounding whitespace with a newline.
        //line = [line stringByReplacingOccurrencesOfRegex:@"^\\s*@details\\s*" withString:@"\n"];
        return cleaned;
    }
    return line;
}
// Moves parameter and result documentation to the end of the comment.
//
// A line starting with @param, @result or @return opens a "param block" which extends until a line starting
// with any other `@` directive. All param block lines are placed, in their original order, after the
// remaining lines. The input is returned unchanged when settings.preprocessHeaderDoc is off or when the
// minimum deployment target is below 10.7 (see the preprocessor check).
- (NSArray *)linesByReorderingHeaderDocDirectives:(NSArray *)lines {
#if __MAC_OS_X_VERSION_MIN_REQUIRED >= 1070
    if (!self.settings.preprocessHeaderDoc) return lines;

    NSRegularExpression *paramDirective = [NSRegularExpression regularExpressionWithPattern:@"^\\s*@(param|result|return)" options:NSRegularExpressionDotMatchesLineSeparators error:nil];
    NSRegularExpression *anyDirective = [NSRegularExpression regularExpressionWithPattern:@"^\\s*@[a-z]" options:NSRegularExpressionDotMatchesLineSeparators error:nil];

    NSMutableArray *leadingLines = [NSMutableArray array];
    NSMutableArray *trailingLines = [NSMutableArray array];
    BOOL insideParamBlock = NO;
    for (NSString *line in lines) {
        NSRange wholeLine = NSMakeRange(0, [line length]);
        if ([paramDirective numberOfMatchesInString:line options:0 range:wholeLine] > 0) {
            insideParamBlock = YES;
        } else if ([anyDirective numberOfMatchesInString:line options:0 range:wholeLine] > 0) {
            insideParamBlock = NO;
        }
        // Lines without any directive stay with whichever block is currently open.
        NSMutableArray *destination = insideParamBlock ? trailingLines : leadingLines;
        [destination addObject:line];
    }

    [leadingLines addObjectsFromArray:trailingLines];
    return leadingLines;
#else
    return lines;
#endif
}
// Clears the accumulated text of both the last and the previous comment. Comment tokens and multiline
// flags are left as they are.
- (void)resetComments {
    GBLogDebug(@"Resetting comments...");
    NSString *nothing = @"";
    [self.lastCommentBuilder setString:nothing];
    [self.previousCommentBuilder setString:nothing];
}
// Returns the most recently collected comment, or nil when no comment text is available. The text is
// cleaned via commentValueFromString:isMultiline: and source info is derived from the comment's token.
- (GBComment *)lastComment {
    NSMutableString *rawText = self.lastCommentBuilder;
    if ([rawText length] == 0) return nil;
    NSString *text = [self commentValueFromString:rawText isMultiline:self.isLastCommentMultiline];
    GBSourceInfo *info = [self sourceInfoForToken:self.lastCommentToken];
    return [GBComment commentWithStringValue:text sourceInfo:info];
}
// Returns the comment collected before the last one, or nil when no such comment text is available. The
// text is cleaned via commentValueFromString:isMultiline: and source info is derived from the comment's token.
- (GBComment *)previousComment {
    NSMutableString *rawText = self.previousCommentBuilder;
    if ([rawText length] == 0) return nil;
    NSString *text = [self commentValueFromString:rawText isMultiline:self.isPreviousCommentMultiline];
    GBSourceInfo *info = [self sourceInfoForToken:self.previousCommentToken];
    return [GBComment commentWithStringValue:text sourceInfo:info];
}
#pragma mark Helper methods
// Reads the given tokenizer until its EOF token and returns all tokens in order. Comment reporting is
// switched on for the duration of the scan (and restored afterwards) so that appledoc comments are
// included in the result; comment tokens that aren't appledoc comments are left out.
- (NSArray *)allTokensFromTokenizer:(PKTokenizer *)tokenizer {
    BOOL originalReporting = tokenizer.commentState.reportsCommentTokens;
    tokenizer.commentState.reportsCommentTokens = YES;

    NSMutableArray *collected = [NSMutableArray array];
    for (PKToken *token = [tokenizer nextToken]; token != [PKToken EOFToken]; token = [tokenizer nextToken]) {
        BOOL isOrdinaryComment = [token isComment] && ![token isAppledocComment];
        if (!isOrdinaryComment) [collected addObject:token];
    }

    tokenizer.commentState.reportsCommentTokens = originalReporting;
    return collected;
}
// Returns the offset of the first character of the line that contains the character at the given offset:
// one past the closest newline character preceding the offset, or 0 when the offset is on the first line.
- (NSUInteger)offsetOfLineContainingOffset:(NSUInteger)offset {
    NSCharacterSet *newlines = [NSCharacterSet newlineCharacterSet];
    NSRange searchRange = NSMakeRange(0, offset);
    NSRange closestNewline = [self.input rangeOfCharacterFromSet:newlines options:NSBackwardsSearch range:searchRange];
    return (closestNewline.location == NSNotFound) ? 0 : closestNewline.location + 1;
}
// Returns the indentation in front of the given offset: the number of characters between the start of the
// line and the offset, provided all of them are whitespace (0 if the offset is at the line start).
// Returns -1 if any non-whitespace character precedes the offset on its line.
- (NSInteger)indentationAtOffset:(NSUInteger)offset {
    NSUInteger lineStart = [self offsetOfLineContainingOffset:offset];
    NSUInteger prefixLength = offset - lineStart;
    if (prefixLength == 0) return 0;

    NSCharacterSet *nonWhitespace = [[NSCharacterSet whitespaceCharacterSet] invertedSet];
    NSRange firstNonWhitespace = [self.input rangeOfCharacterFromSet:nonWhitespace
                                                             options:0
                                                               range:NSMakeRange(lineStart, prefixLength)];
    BOOL onlyWhitespaceBefore = (firstNonWhitespace.location == NSNotFound);
    return onlyWhitespaceBefore ? (NSInteger)prefixLength : -1;
}
#pragma mark Properties
// Explicit synthesis. Properties declared in the class extension that are not listed here
// (lastCommentBuilder, singleLineCommentAfterRegex) are not explicitly synthesized in this file.
// NOTE(review): lastComment and previousComment are not declared in the class extension and have custom
// getters above - presumably they're declared readonly in the header; confirm.
@synthesize filename;
@synthesize input;
@synthesize tokens;
@synthesize tokenIndex;
@synthesize lastComment;
@synthesize lastCommentToken;
@synthesize previousComment;
@synthesize previousCommentBuilder;
@synthesize previousCommentToken;
@synthesize isLastCommentMultiline;
@synthesize isPreviousCommentMultiline;
@synthesize singleLineCommentRegex;
@synthesize multiLineCommentRegex;
@synthesize commentDelimiterRegex;
@synthesize settings;
@end