#!python # ***** BEGIN LICENSE BLOCK ***** # Version: MPL 1.1/GPL 2.0/LGPL 2.1 # # The contents of this file are subject to the Mozilla Public License # Version 1.1 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # http://www.mozilla.org/MPL/ # # Software distributed under the License is distributed on an "AS IS" # basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the # License for the specific language governing rights and limitations # under the License. # # The Original Code is Komodo code. # # The Initial Developer of the Original Code is ActiveState Software Inc. # Portions created by ActiveState Software Inc are Copyright (C) 2000-2007 # ActiveState Software Inc. All Rights Reserved. # # Contributor(s): # ActiveState Software Inc # # Alternatively, the contents of this file may be used under the terms of # either the GNU General Public License Version 2 or later (the "GPL"), or # the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), # in which case the provisions of the GPL or the LGPL are applicable instead # of those above. If you wish to allow use of your version of this file only # under the terms of either the GPL or the LGPL, and not to allow others to # use your version of this file under the terms of the MPL, indicate your # decision by deleting the provisions above and replace them with the notice # and other provisions required by the GPL or the LGPL. If you do not delete # the provisions above, a recipient may use your version of this file under # the terms of any one of the MPL, the GPL or the LGPL. # # ***** END LICENSE BLOCK ***** from xpcom import components, ServerException from xpcom.server import UnwrapObject from koLintResult import KoLintResult from koLintResults import koLintResults from xpcom.server.enumerator import * import os, sys, re import StringIO # Do not use cStringIO! See html5 class for an explanation from zope.cachedescriptors.property import LazyClassAttribute import eollib import html5lib from html5lib.constants import E as html5libErrorDict import process import logging logging.basicConfig() log = logging.getLogger("KoHTMLLinter") #log.setLevel(logging.DEBUG) _doctype_re = re.compile("", re.IGNORECASE|re.DOTALL) class MultiLangStringBuilder(dict): """ The HTML linter takes in its document's text, and then writes out subsets of that text destined for each separate language's linter. It also sometimes needs to wrap snippets, usually with either a function definition wrapper or a CSS declaration, and that causes the coordinates of some of the text the linter sees to deviate from the text the user sees. This class pushes the injected text into the pending white space, when possible. """ def __init__(self, names): dict.__init__(self, dict([(k, []) for k in names])) self._pendingWhiteSpace = dict([(k, '') for k in names]) def addWhiteSpace(self, name, s): self._pendingWhiteSpace[name] += s def _pushBlanks(self, name): if self._pendingWhiteSpace[name]: self[name].append(self._pendingWhiteSpace[name]) self._pendingWhiteSpace[name] = '' def __setitem__(self, name, s): self._pushBlanks(name) self[name].append(s) def finish(self): for name in self.keys(): self._pushBlanks(name) def last_text_matches_pattern(self, name, ptn): if not self[name]: # If there is no text at all, count as True return True for s in self[name][-1 : -len(self[name]) - 1: -1]: s1 = s.rstrip() if not s1: # Ignore sequences with white-space only continue if ptn.search(s1): return True return False def replace_ending_white_space(self, name, newStr, lineNum): if not newStr: return numChars = len(newStr) ending_spaces_re = re.compile(r'[ \t]{1,%d}\Z' % numChars) m = ending_spaces_re.search(self._pendingWhiteSpace[name]) if m: mglen = len(m.group()) if mglen >= numChars: self[name].append(self._pendingWhiteSpace[name][:-numChars]) else: self[name].append(self._pendingWhiteSpace[name][:-mglen]) # Manually push everything to the self._pendingWhiteSpace[name] = '' self[name].append(newStr) else: # Make sure the non-white item is preceded by start or a space. if self[name] and not self._pendingWhiteSpace[name] and not self[name][-1].isspace(): self._pendingWhiteSpace[name] += ' ' self[name] = newStr #---- component implementation class _CommonHTMLLinter(object): _com_interfaces_ = [components.interfaces.koILinter] def __init__(self): self._koLintService = components.classes["@activestate.com/koLintService;1"].getService(components.interfaces.koILintService) self._lintersByLangName = { "CSS": self._koLintService.getLinterForLanguage("CSS"), "JavaScript": self._koLintService.getLinterForLanguage("JavaScript"), } _nonNewlineMatcher = re.compile(r'[^\r\n]') def _spaceOutNonNewlines(self, markup): return self._nonNewlineMatcher.sub(' ', markup) def _linterByName(self, langName, currentLinters): if langName in currentLinters: return currentLinters[langName] if langName not in self._lintersByLangName: try: linter = self._koLintService.getLinterForLanguage(langName) self._lintersByLangName[langName] = linter except: log.error("No linter for language %s", langName) linter = None self._lintersByLangName[langName] = linter currentLinters[langName] = linter return self._lintersByLangName[langName] def _getMappedName(self, name): return self._mappedNames and self._mappedNames.get(name, name) or name def _blankOutOneLiners(self, code): if "\n" in code.strip(): return code return self._spaceOutNonNewlines(code) def addLineNumbers(self, s, currLineNum): lines = s.splitlines(True) nums = range(0, len(lines)) return "".join(["%4d:%s" % (num + currLineNum, line) for num, line in zip(nums, lines)]) def _trim(self, s): if len(s) <= 500: return s return s[:100] + "..." + s[-100:] def _getLastMarkupText(self, koDoc, transitionPoints, i, textAsBytes): """ Return the most recent chunk of markup text """ startPt = transitionPoints[i] i -= 1 while i >= 0: endPt = startPt startPt = transitionPoints[i] if startPt == endPt: continue origLangName = koDoc.languageForPosition(startPt) if origLangName in ('HTML', 'HTML5', 'XUL', 'XBL'): currText = textAsBytes[startPt:endPt] return currText i -= 1 return "" # Give up. @LazyClassAttribute def _ends_with_cdata_re(self): return re.compile(r'(?:\s*\]\]>|\s*-->)+\s*\Z', re.DOTALL) @LazyClassAttribute def _ends_with_gt(self): return re.compile(r'>\s*\Z'); @LazyClassAttribute def _ends_with_quote_re(self): return re.compile(r'[\"\']\Z'); @LazyClassAttribute def _ends_with_zero(self): return re.compile(r'0\s*\Z', re.DOTALL) @LazyClassAttribute def _event_re(self): return re.compile(r'\bevent\b') @LazyClassAttribute def _function_re(self): return re.compile(r'\bfunction\b') @LazyClassAttribute def _js_code_end_re(self): return re.compile(r'[\};]\s*$', re.DOTALL) @LazyClassAttribute def _nl_re(self): return re.compile('\\n') @LazyClassAttribute def _return_re(self): return re.compile(r'\breturn\b') @LazyClassAttribute def _script_start_re(self): return re.compile(r']*>\s*\Z', re.DOTALL) @LazyClassAttribute def _starts_with_cdata_re(self): return re.compile(r'(?:\s*]*>\s*\Z', re.DOTALL) @LazyClassAttribute def _xbl_handler_re(self): return re.compile(r'<(?:\w+:)?handler[^>]*>\s*\Z', re.DOTALL) @LazyClassAttribute def _xbl_method_re(self): return re.compile(r'<(?:\w+:)?method\b.*?]*>\s*\Z', re.DOTALL) @LazyClassAttribute def _xbl_method_name_re(self): return re.compile(r'<(?:\w+:)?method\b.*?name\s*=\s*[\'\"](\w+)', re.DOTALL) @LazyClassAttribute def _xbl_method_parameter_re(self): return re.compile(r'<(?:\w+:)?parameter\b.*?name\s*=\s*[\'\"](\w+)[\'\"].*?>', re.DOTALL) @LazyClassAttribute def _xbl_setter_re(self): return re.compile(r'<(?:\w+:)?setter[^>]*>\s*\Z', re.DOTALL) @LazyClassAttribute def _xml_decln_re(self): return re.compile(r'(<)<\?\?>(\?.*)', re.DOTALL) # Matching state values. Tracking when we're in CSS or JS, and when we're in SSL code. _IN_M = 0x0001 _IN_JS_SCRIPT = 0x0002 _IN_JS_FUNCTION_DEF = 0x0004 _IN_JS_FUNCTION_DEF_INVOCN = 0x0008 _IN_JS_OTHER = 0x0010 _IN_JS_SQUELCH = 0x0020 _IN_JS_EMIT = _IN_JS_SCRIPT|_IN_JS_FUNCTION_DEF|_IN_JS_FUNCTION_DEF_INVOCN|_IN_JS_OTHER _IN_JS = _IN_JS_EMIT|_IN_JS_SQUELCH _IN_CSS_STYLE = 0x0040 _IN_CSS_ATTR = 0x0080 _IN_CSS_SQUELCH = 0x0100 _IN_CSS_EMIT = _IN_CSS_STYLE|_IN_CSS_ATTR _IN_CSS = _IN_CSS_EMIT|_IN_CSS_ATTR _IN_SSL_EMITTER = 0x0200 _IN_SSL_BLOCK = 0x0400 _IN_SSL = _IN_SSL_EMITTER|_IN_SSL_BLOCK _take_all_languages = ("PHP",) # Hand these SSL languages the whole document # Are there others beside PHP? # This pattern is for bug 95364, support Rails-hack form of ERB to # support forms like <%= form_tag ... do |f| %>...<% end %> # Note mismatched <%= ... %><% %> -- this is deliberate in Rails 3 @LazyClassAttribute def RERB_Block_PTN(self): return re.compile(r'.*?\s*(?:do|\{)(?:\s*\|[^|]*\|)?\s*\Z') def _lint_common_html_request(self, request, udlMapping=None, linters=None, TPLInfo=None, startCheck=None): """ Hand off bits of text to each lexer. @param udlMapping: Example: udlMapping={"Perl":"Mason"} -- used for mapping SSL code to the actual multi-lang language @param startCheck: Some languages need to insert a doctype. If there's a startCheck, it contains a language, a pattern, and text to insert if the pattern fails to match @param TPLInfo (languageName, emitPattern) If we're matching language and we match , it means that an SSL language will be inserting some text into the eventual HTML document. If this follows CSS or JS, we need to insert some text to keep the respective CSS/JS linter happy. The markup lexer (HTML/HTML5/XML) sees all the core text: markup, CSS, JS PHP is handled off everything All other languages see only their own language. There are tricks for wrapping bits of JS and CSS, see below. Start in state _IN_M The JS lexer gets text between script tags passed as is. But then there are other wrinkles: M -> JS after: /on\w+\s*=\s*["']/ :: If the terms 'return' and 'event' aren't used here, insert a ';' if needed Otherwise, insert 'function _kof##() {';, => _IN_JS_FUNCTION_DEF and add an 'event' arg. M -> JS after: /\s*/ :: => _IN_JS_SQUELCH M -> JS after: /\s*/ :: blank _IN_JS_SCRIPT M -> JS after: />\s*/ :: insert 'function _kof##() {'; blank _IN_JS_FUNCTION_DEF_INVOCN M -> JS after: other: , => _IN_JS_SQUELCH M -> CSS after: />\s*/ :: nothing, => _IN_CSS_STYLE M -> CSS after: ["'] :: insert '_x {', => _IN_CSS_ATTR M -> CSS after: other: => _IN_CSS_SQUELCH land at M, currState & _IN_JS: blank /]]>\s*/ _IN_JS_SQUELCH: emit nothing _IN_JS_SCRIPT: emit nothing _IN_JS_FUNCTION_DEF: emit '}' land at M, currState & _IN_CSS: _IN_CSS_STYLE: emit nothing _IN_CSS_ATTR: emit '}' (currState & _IN_JS_EMIT|_IN_CSS_EMIT) on TPL_EMITTER_START: insert '0' (number, not a string)' add state _IN_SSL_EMITTER find TPL_BLOCK_START => _IN_SSL_BLOCK SSL code emitted to SSL lang _IN_SSL_BLOCK find TPL_EMITTER_START => _IN_SSL_EMITTER SSL code not emitted to SSL lang when _IN_SSL_EMITTER is on find TPL_END => drop _IN_SSL Two points about this state machine: 1. SSL_EMITTERS start with patterns like /<\?=/ or /<\?php\s+echo\b/. They emit a value that the browser will see. SSL_BLOCK doesn't emit code, so it all gets squelched. 2. States will overlap across families. For example, we can have JS_SQUELCH and SSL_BLOCK at the same time 3. Whenever we end up in markup, we can end whatever is pending, and clear all overlapped states, ending at _IN_M """ self._mappedNames = udlMapping # These are lines where we added text. If the linter complains about # any of these lines, make sure the error message spans the entire line. self._emittedCodeLineNumbers = set() # These refer to lines where the SSL and JS and/or CSS are interleaved, # which could lead to possible false-positives. Just don't report JS/CSS # errors/warnings on these lines. self._multiLanguageLineNumbers = set() lintersByName = {} # Copy working set of linters into a local var lintersByName.update(self._lintersByLangName) if linters: lintersByName.update(linters) koDoc = request.koDoc # koDoc is a proxied object koDoc_language = koDoc.language transitionPoints = koDoc.getLanguageTransitionPoints(0, koDoc.bufferLength) languageNamesAtTransitionPoints = [koDoc.languageForPosition(pt) for pt in transitionPoints[:-2]] if not languageNamesAtTransitionPoints: languageNamesAtTransitionPoints = [koDoc.languageForPosition(0)] # We need to lint the utf-8 representation to keep coordinates # in sync with Scintilla # request.content contains a Unicode representation, even if the # buffer's encoding is utf-8 -- content is an AString textAsBytes = request.content.encode("utf-8") uniqueLanguageNames = dict([(k, None) for k in languageNamesAtTransitionPoints]) if udlMapping: for targetName in udlMapping.values(): try: uniqueLanguageNames[targetName] = [] except TypeError: log.debug("udlMapping:%s, targetName:%r", udlMapping, targetName) uniqueLanguageNames = uniqueLanguageNames.keys() #log.debug("transitionPoints:%s", transitionPoints) #log.debug("uniqueLanguageNames:%s", uniqueLanguageNames) ###bytesByLang =OLD### dict([(k, []) for k in uniqueLanguageNames]) bytesByLang = MultiLangStringBuilder(uniqueLanguageNames) lim = len(transitionPoints) endPt = 0 htmlAllowedNames = ("HTML", "HTML5", "CSS", "JavaScript", "XML") currState = self._IN_M prevText = "" startLang = languageNamesAtTransitionPoints[0] if startLang in htmlAllowedNames: prevLanguageFamily = {"JavaScript":"CSL", "CSS":"CSS"}.get(startLang, "M") else: #XXX: One day, distinguish SSL from TPL prevLanguageFamily = "SSL" currLineNum = 1 js_func_num = 0 js_func_name_prefix = "__kof_" for i in range(1, lim): startPt = endPt endPt = transitionPoints[i] if startPt == endPt: continue currText = textAsBytes[startPt:endPt] numNewLinesInCurrText = len(self._nl_re.findall(currText)) origLangName = koDoc.languageForPosition(startPt) langName = self._getMappedName(origLangName) #log.debug("segment: raw lang name: %s, lang:%s, %d:%d [[%s]]", # koDoc.languageForPosition(startPt), # langName, startPt, endPt, self.addLineNumbers(currText, currLineNum)) if TPLInfo and origLangName == TPLInfo[0]: for j in range(currLineNum, currLineNum + numNewLinesInCurrText + 1): self._multiLanguageLineNumbers.add(j) squelchedText = self._spaceOutNonNewlines(currText) for name in bytesByLang.keys(): if origLangName == "CSS" and langName == name: if currState & self._IN_CSS: # We're in a run of CSS, could be separated by SSL blocks pass else: if prevLanguageFamily != "M": # Handle the case of