#!/usr/bin/env python # ***** BEGIN LICENSE BLOCK ***** # Version: MPL 1.1/GPL 2.0/LGPL 2.1 # # The contents of this file are subject to the Mozilla Public License # Version 1.1 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # http://www.mozilla.org/MPL/ # # Software distributed under the License is distributed on an "AS IS" # basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the # License for the specific language governing rights and limitations # under the License. # # The Original Code is Komodo code. # # The Initial Developer of the Original Code is ActiveState Software Inc. # Portions created by ActiveState Software Inc are Copyright (C) 2000-2007 # ActiveState Software Inc. All Rights Reserved. # # Contributor(s): # ActiveState Software Inc # # Alternatively, the contents of this file may be used under the terms of # either the GNU General Public License Version 2 or later (the "GPL"), or # the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), # in which case the provisions of the GPL or the LGPL are applicable instead # of those above. If you wish to allow use of your version of this file only # under the terms of either the GPL or the LGPL, and not to allow others to # use your version of this file under the terms of the MPL, indicate your # decision by deleting the provisions above and replace them with the notice # and other provisions required by the GPL or the LGPL. If you do not delete # the provisions above, a recipient may use your version of this file under # the terms of any one of the MPL, the GPL or the LGPL. # # ***** END LICENSE BLOCK ***** # # Some CILE parsing utils extracted originally from phpcile and jscile # but mostly generally useful to CILEs implemented in Python. import codecs import locale import re import sys import os from codeintel2.common import CILEError #---- exported routines def tryEncoding(buffer, encoding): """ buffer, encoding -> encoding_buffer Attempts to encode the buffer using the specified encoding Returns None on failure, a Unicode version of the buffer on success. """ #log.info("_tryEncoding...%s",encoding) try: secret_decoder_ring = codecs.lookup(encoding)[1] except LookupError, e: # the encoding name doesn't exist, likely a pep263 failure # an example is using windows-1250 as the name return None try: (outdata,len) = secret_decoder_ring(buffer) return outdata except Exception, e: # Figure out the real exception types return None try: _defaultEncoding = locale.getdefaultlocale()[1] except ValueError: _defaultEncoding = None if _defaultEncoding is not None: _defaultEncoding = _defaultEncoding.lower() def getEncodedBuffer(buffer): decodedBuffer = tryEncoding(buffer, 'utf-8') if decodedBuffer is not None: return (decodedBuffer, 'utf-8', '') if _defaultEncoding is not None: decodedBuffer = tryEncoding(buffer, _defaultEncoding) if decodedBuffer is not None: return (decodedBuffer, _defaultEncoding, '') return (tryEncoding(buffer, 'iso8859-1'), 'iso8859-1', '') def urlencode_path(s): """URL-encode the given path string. This URL-encoding attempts to NOT encode characters that are typically legal path characters, e.g. '/', '\\', ':'. This is so that the result can more naturally be used as a filepath argument. The string must be an 8-bit string (that is all that URL-encoding can handle). """ from urllib import quote safe = os.sep + (os.altsep or '') + ":" return quote(s, safe=safe) #---- javadoc parsing _javadoc1 = re.compile(r'\s*\/\*(.*)\*\/', re.S) _javadoc2 = re.compile(r'^(\s*\*)', re.M) _linedoc = re.compile(r'^(\s*#|\s*\/\/)', re.M) _indent = re.compile(r'^([ \t]*)', re.M) _param = re.compile(r'^\s*@param\s+(?P[\w\\]+)\s+\$(?P\w+)(?:\s+?(?P.*?))?', re.M|re.U) _return = re.compile(r'^\s*@return\s+(?P[\w\\]+)(?:\s+(?P.*))?', re.M|re.U) def uncommentDocString(doc): # remove block style leading and end comments d = '\n'.join(re.findall(_javadoc1, doc)) if d: # remove starting * if javadoc style d = re.sub(_javadoc2,'',d) else: d = doc # remove line style comments d = re.sub(_linedoc,'',d) # trim preceeding blank lines. we dont want to trim the first non-blank line lines = d.split('\n') while len(lines) and not lines[0].strip(): lines.pop(0) d = '\n'.join(lines) # trip any blank end lines d = d.rstrip() # guess the indent size spaces = re.findall(_indent, d) indent = spaces[0] for s in spaces: if len(s) and len(s) < indent: indent = len(s) # dedent the block if not indent: return d dedent = re.compile(r'^([ \t]{%d})' % indent, re.M) d = re.sub(dedent, '', d) return d def parseDocString(doc): d = uncommentDocString(doc) params = re.findall(_param, d) result = re.findall(_return, d) if result: result = result[0] return (d, params, result) SKIPTOK = 0x01 # don't consider this a token that is to be considered a part of the grammar, like '\n' MAPTOK = 0x02 # use the token associated with the pattern when it matches EXECFN= 0x04 # execute the function associated with the pattern when it matches USETXT = 0x08 # if you match a single character and want its ascii value to be the token class recollector: def __init__(self): self.res = {} self.regs = {} def add(self, name, reg, mods=None ): self.regs[name] = reg % self.regs #print "%s = %s" % (name, self.regs[name]) if mods: self.res[name] = re.compile(self.regs[name], mods) # check that it is valid else: self.res[name] = re.compile(self.regs[name]) # check that it is valid # Lexer class borrowed from the PyLRd project, # http://starship.python.net/crew/scott/PyLR.html class Lexer: eof = -1 def __init__(self): self.tokenmap = {} self.prgmap = {} self.prglist = [] self.lasttok = -1 self.text = "" self.textindex = 0 self.tokennum2name = {} def nexttok(self): self.lasttok = self.lasttok + 1 return self.lasttok def settext(self, t): self.text = t self.textindex = 0 def addmatch(self, prg, func=None, tokname="", attributes=MAPTOK|EXECFN): self.prglist.append(prg) tok = -2 if not func: attributes = attributes & ~EXECFN if not tokname: attributes = attributes & ~MAPTOK if attributes & MAPTOK: self.tokenmap[tokname] = tok = self.nexttok() else: tok = self.nexttok() self.prgmap[prg] = tok, attributes, func self.tokennum2name[tok] = tokname def scan(self): for prg in self.prglist: mo = prg.match(self.text, self.textindex) if not mo: continue self.textindex = self.textindex + len(mo.group(0)) tmpres = mo.group(0) t, attributes, fn = self.prgmap[prg] #log.debug("'%s' token: %r", self.tokennum2name[t], tmpres) if attributes & EXECFN: tmpres = apply(fn, (mo,)) if attributes & USETXT: t = ord(mo.group(0)[0]) return (t, tmpres) if self.textindex >= len(self.text): return (self.eof, "") raise CILEError("Syntax Error in lexer")