ポイント
それなりに汎用的な Flex みたいなものを作ったので、その部分は CSS 以外にも使えると思います。
あと、定義を文字列で書かずに正規表現オブジェクトで書くのでバックスラッシュをエスケープせずに書けます。ですので、ほとんどの箇所は WebKit の tokenizer の定義をコピーするだけで済みました。
その辺のアイデアは
JavaScript で構文解析: Days on the Moon
を参考にしました。
あと
http://svn.coderepos.org/share/lang/actionscript/ascss/src/css/CSSLexer.as
id:gyuque さんの ASCSS もいっぱい参考にしました。
ソース
トークンの値は、 kmyacc で生成しています。
/**
 * JSCSS: a small flex-like tokenizer for CSS.
 *
 * Declared with `var` (the original assigned to an undeclared name, which
 * throws a ReferenceError in strict mode / ES modules). At the top level of a
 * classic script this still creates the global `JSCSS`.
 */
var JSCSS = {};

/**
 * Token numbers returned by CSSLexer#next(). According to the accompanying
 * notes these values were generated by kmyacc, so they must stay in sync with
 * the parser that consumes them.
 */
JSCSS.tokens = {
  YYERRTOK: 256,
  UNIMPORTANT_TOK: 257,
  WHITESPACE: 258,
  SGML_CD: 259,
  INCLUDES: 260,
  DASHMATCH: 261,
  BEGINSWITH: 262,
  ENDSWITH: 263,
  CONTAINS: 264,
  STRING: 265,
  IDENT: 266,
  NTH: 267,
  HEX: 268,
  IDSEL: 269,
  IMPORT_SYM: 270,
  PAGE_SYM: 271,
  MEDIA_SYM: 272,
  FONT_FACE_SYM: 273,
  CHARSET_SYM: 274,
  NAMESPACE_SYM: 275,
  WEBKIT_RULE_SYM: 276,
  WEBKIT_DECLS_SYM: 277,
  WEBKIT_VALUE_SYM: 278,
  WEBKIT_MEDIAQUERY_SYM: 279,
  IMPORTANT_SYM: 280,
  MEDIA_ONLY: 281,
  MEDIA_NOT: 282,
  MEDIA_AND: 283,
  QEMS: 284,
  EMS: 285,
  EXS: 286,
  PXS: 287,
  CMS: 288,
  MMS: 289,
  INS: 290,
  PTS: 291,
  PCS: 292,
  DEGS: 293,
  RADS: 294,
  GRADS: 295,
  MSECS: 296,
  SECS: 297,
  HERZ: 298,
  KHERZ: 299,
  DIMEN: 300,
  PERCENTAGE: 301,
  FLOATTOKEN: 302,
  INTEGER: 303,
  URI: 304,
  FUNCTION: 305,
  NOTFUNCTION: 306,
  UNICODERANGE: 307
};

/**
 * Creates a lexer over `source` and positions it at the start of the input.
 *
 * @param {string} source CSS text to tokenize (null/undefined is treated as
 *     empty input).
 */
JSCSS.CSSLexer = function(source) {
  this.source = source;
  this.reset();
};

(function() {
  // Flex-style description of the tokenizer.
  //
  // `defs` is a flat list of (name, RegExp) pairs. A definition may reference
  // an earlier one as `{name}`; order therefore matters.
  //
  // `rules` is a flat list of (tokenName, spec) pairs where spec is
  //   [pattern, requiredState, nextState, isLiteral]
  //   pattern       - RegExp (may contain `{name}` references) or plain string
  //   requiredState - rule is only active in this lexer state (undefined: any)
  //   nextState     - state to switch to when the rule wins (undefined: keep)
  //   isLiteral     - if true, next() returns the char code of the first
  //                   matched character instead of a JSCSS.tokens number
  //
  // NOTE: JavaScript alternation (`a|b`) takes the FIRST alternative that
  // lets the match succeed, whereas flex takes the LONGEST. Wherever one
  // alternative can match a prefix of another and nothing follows to force
  // backtracking, the longer alternative is therefore listed first
  // (`hexcolor`, `num`, `range`, `UNICODERANGE`).
  var lex = {
    defs: [
      'h', /[0-9a-fA-F]/,
      'nonascii', /[^\0-\177]/,
      'unicode', /\\{h}{1,6}[ \t\r\n\f]?/,
      'escape', /{unicode}|\\[ -~]|\\[^\0-\177]/,
      'nmstart', /[_a-zA-Z]|{nonascii}|{escape}/,
      'nmchar', /[_a-zA-Z0-9-]|{nonascii}|{escape}/,
      // 6 digits first: otherwise "#123456" would stop after "#123".
      'hexcolor', /{h}{6}|{h}{3}/,
      'ident', /-?{nmstart}{nmchar}*/,
      'name', /{nmchar}+/,
      // Fractional form first: otherwise "1.5" would stop after "1".
      'num', /[0-9]*\.[0-9]+|[0-9]+/,
      'intnum', /[0-9]+/,
      'url', /([!#$%&*-~]|{nonascii}|{escape})*/,
      'w', /[ \t\r\n\f]*/,
      'nl', /\n|\r\n|\r|\f/,
      'string1', /\"([\t !#$%&(-~]|\\{nl}|\'|{nonascii}|{escape})*\"/,
      'string2', /\'([\t !#$%&(-~]|\\{nl}|\"|{nonascii}|{escape})*\'/,
      'string', /{string1}|{string2}/,
      // Up to six characters: hex digits followed by '?' wildcards. At every
      // level "one more hex digit" is tried before the (possibly empty) run
      // of '?', so the longest range is consumed.
      'range', /\?{1,6}|{h}({h}({h}({h}({h}({h}|\??)|\?{0,2})|\?{0,3})|\?{0,4})|\?{0,5})/,
      'nth', /(-?[0-9]*n[\+-][0-9]+)|(-?[0-9]*n)/
    ],
    rules: [
      'COMMENT', [/\/\*[^*]*\*+([^/*][^*]*\*+)*\//],
      'WHITESPACE', [/[ \t\r\n\f]+/],
      'SGML_CD', [/<!--|-->/],
      'INCLUDES', ["~="],
      'DASHMATCH', ["|="],
      'BEGINSWITH', ["^="],
      'ENDSWITH', ["$="],
      'CONTAINS', ["*="],
      'MEDIA_NOT', ["not", 'mediaquery'],
      'MEDIA_ONLY', ["only", 'mediaquery'],
      'MEDIA_AND', ["and", 'mediaquery'],
      'STRING', [/{string}/],
      'IDENT', [/{ident}/],
      'NTH', [/{nth}/],
      'HEX', [/#{hexcolor}/],
      'IDSEL', [/#{ident}/],
      'IMPORT_SYM', ["@import", undefined, 'mediaquery'],
      'PAGE_SYM', ["@page"],
      'MEDIA_SYM', ["@media", undefined, 'mediaquery'],
      'FONT_FACE_SYM', ["@font-face"],
      'CHARSET_SYM', ["@charset"],
      'NAMESPACE_SYM', ["@namespace"],
      'WEBKIT_RULE_SYM', ["@-webkit-rule"],
      'WEBKIT_DECLS_SYM', ["@-webkit-decls"],
      'WEBKIT_VALUE_SYM', ["@-webkit-value"],
      'WEBKIT_MEDIAQUERY_SYM', ["@-webkit-mediaquery", undefined, 'mediaquery'],
      'IMPORTANT_SYM', [/!{w}important/],
      'EMS', [/{num}em/],
      'QEMS', [/{num}__qem/],
      'EXS', [/{num}ex/],
      'PXS', [/{num}px/],
      'CMS', [/{num}cm/],
      'MMS', [/{num}mm/],
      'INS', [/{num}in/],
      'PTS', [/{num}pt/],
      'PCS', [/{num}pc/],
      'DEGS', [/{num}deg/],
      'RADS', [/{num}rad/],
      'GRADS', [/{num}grad/],
      'MSECS', [/{num}ms/],
      'SECS', [/{num}s/],
      'HERZ', [/{num}Hz/],
      'KHERZ', [/{num}kHz/],
      'DIMEN', [/{num}{ident}/],
      'PERCENTAGE', [/{num}%+/],
      'INTEGER', [/{intnum}/],
      'FLOATTOKEN', [/{num}/],
      'NOTFUNCTION', ["not("],
      'URI', [/(?:url\({w}{string}{w}\))|(?:url\({w}{url}{w}\))/],
      'FUNCTION', [/{ident}\(/],
      // "U+xxxx-yyyy" first: otherwise {range} would match only the part
      // before the dash.
      'UNICODERANGE', [/(?:U\+{h}{1,6}-{h}{1,6})|(?:U\+{range})/],
      'MEDIAQUERY_END', [/{|;/, 'mediaquery', 'INITIAL', true],
      'ALSO', [/./, undefined, undefined, true]
    ]
  };

  var hasOwn = Object.prototype.hasOwnProperty;

  // name -> fully expanded pattern source, wrapped in a non-capturing group.
  var defs = {};

  // Replaces every `{name}` that refers to an already-known definition with
  // its expansion. A replacer FUNCTION is used on purpose: the expansions
  // contain `$`, which a replacement STRING would interpret specially.
  // Braces that are not definition references (`{1,6}`, a lone `{`) are left
  // untouched.
  var expand = function(patternSource) {
    return patternSource.replace(/\{([A-Za-z_][A-Za-z0-9_]*)\}/g, function(whole, name) {
      return hasOwn.call(defs, name) ? defs[name] : whole;
    });
  };

  for (var d = 0; d < lex.defs.length; d += 2) {
    defs[lex.defs[d]] = '(?:' + expand(lex.defs[d + 1].source) + ')';
  }

  // Compile every RegExp rule into a pattern anchored at the start of the
  // remaining input; string rules are compared literally and stay as they are.
  var rules = lex.rules;
  for (var r = 0; r < rules.length; r += 2) {
    var spec = rules[r + 1];
    if (spec[0] instanceof RegExp) {
      spec[0] = new RegExp('^(?:' + expand(spec[0].source) + ')');
    }
  }

  JSCSS.CSSLexer.rules = rules;
})();

JSCSS.CSSLexer.prototype = {
  /** Current lexer state: 'INITIAL' or 'mediaquery'. */
  state: 'INITIAL',
  /** The text handed to the constructor. */
  source: null,
  /** Text of the most recently returned token (null before the first token and at end of input). */
  tokenBody: null,
  /** True once next() has reached the end of the input. */
  finished: false,

  /**
   * Consumes and returns the next token.
   *
   * All rules active in the current state are tried against the remaining
   * input; the longest match wins and, on a tie, the rule listed first wins.
   * The matched text is stored in `this.tokenBody`.
   *
   * Rules that have no entry in JSCSS.tokens and are not literal (COMMENT)
   * are consumed silently and lexing continues with the following token.
   *
   * @return {number} a JSCSS.tokens value, the char code of the matched
   *     character for literal rules, or 0 at end of input.
   */
  next: function() {
    var rules = JSCSS.CSSLexer.rules;
    var hasOwn = Object.prototype.hasOwnProperty;

    for (;;) {
      var input = this.cur;
      if (input.length === 0) {
        this.tokenBody = null;
        this.finished = true;
        return 0;
      }

      var bestLength = 0;
      var bestName = null;
      var bestBody = null;
      var bestNextState = undefined;
      var bestLiteral = false;

      for (var i = 0; i < rules.length; i += 2) {
        var rule = rules[i + 1];
        if (rule[1] != null && rule[1] !== this.state) {
          continue;
        }
        var pattern = rule[0];
        var text = null;
        if (pattern instanceof RegExp) {
          var m = pattern.exec(input);
          if (m) {
            text = m[0];
          }
        } else if (input.substring(0, pattern.length) === pattern) {
          // Prefix comparison only; indexOf() would scan the whole input.
          text = pattern;
        }
        // Strictly longer: earlier rules keep priority on equal length.
        if (text !== null && text.length > bestLength) {
          bestLength = text.length;
          bestName = rules[i];
          bestBody = text;
          bestNextState = rule[2];
          bestLiteral = rule[3];
        }
      }

      if (bestNextState != null) {
        this.state = bestNextState;
      }
      this.tokenBody = bestBody;
      this.cur = input.substring(bestLength);

      if (bestName === null) {
        // Nothing matched: report end of input without consuming anything.
        return 0;
      }
      if (bestLiteral) {
        return bestBody.charCodeAt(0);
      }
      if (hasOwn.call(JSCSS.tokens, bestName)) {
        return JSCSS.tokens[bestName];
      }
      // No token number (a comment): skip it and lex the next token.
    }
  },

  /**
   * Rewinds the lexer to the beginning of `source` and restores the initial
   * state, so the same input can be tokenized again with identical results.
   */
  reset: function() {
    this.cur = this.source == null ? '' : String(this.source);
    this.state = 'INITIAL';
    this.tokenBody = null;
    this.finished = false;
  },

  constructor: JSCSS.CSSLexer
};
(追記)
ちょこっと修正、ハッシュを配列にした