#
# ***** BEGIN LICENSE BLOCK *****
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
# 
# The contents of this file are subject to the Mozilla Public License
# Version 1.1 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
# http://www.mozilla.org/MPL/
# 
# Software distributed under the License is distributed on an "AS IS"
# basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
# License for the specific language governing rights and limitations
# under the License.
# 
# The Original Code is Komodo code.
# 
# The Initial Developer of the Original Code is ActiveState Software Inc.
# Portions created by ActiveState Software Inc are Copyright (C) 2000-2007
# ActiveState Software Inc. All Rights Reserved.
# 
# Contributor(s):
#   ActiveState Software Inc
# 
# Alternatively, the contents of this file may be used under the terms of
# either the GNU General Public License Version 2 or later (the "GPL"), or
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
# in which case the provisions of the GPL or the LGPL are applicable instead
# of those above. If you wish to allow use of your version of this file only
# under the terms of either the GPL or the LGPL, and not to allow others to
# use your version of this file under the terms of the MPL, indicate your
# decision by deleting the provisions above and replace them with the notice
# and other provisions required by the GPL or the LGPL. If you do not delete
# the provisions above, a recipient may use your version of this file under
# the terms of any one of the MPL, the GPL or the LGPL.
# 
# ***** END LICENSE BLOCK *****

"""A find/replace backend for Komodo."""

import os
from os.path import (expanduser, join, basename, exists, dirname,
                     abspath, isdir, islink, lexists, normpath, realpath)
import time
import sys
import re
import logging
import codecs
from glob import glob
import cPickle as pickle
from hashlib import md5
from pprint import pprint, pformat

try:
    from scandir import walk as os_walk
except (ImportError, IOError):
    logging.getLogger("findlib2").warn("Unable to import scandir - defaulting to os.walk")
    os_walk = os.walk

try:
    import textinfo
except ImportError:
    kopylib_dir = join(dirname(dirname(abspath(__file__))),
                       "python-sitelib")
    sys.path.insert(0, kopylib_dir)
    import textinfo
import difflibex

import warnings
warnings.simplefilter("ignore", textinfo.ChardetImportWarning) # bug 77562


#---- exceptions and globals

class FindError(Exception):
    pass

log = logging.getLogger("findlib2")
#log.setLevel(logging.DEBUG)



#---- primary API methods

def find(paths, includes=None, excludes=None, env=None):
    """List paths matching the given filters (a la GNU find).

    @param paths {sequence|iterator} is a sequence of paths to search.
    @param includes {list} is a sequence of 2-tuples defining include
        filters on textinfo data: (<textinfo-field>, <value>).
    @param excludes {list} is a sequence of 2-tuples defining exclude
        filters on textinfo data: (<textinfo-field>, <value>).
    @param env {runtime environment}
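
    Example usage (a sketch; `paths` is an assumed iterable of paths and
    "Python" assumes that is the lang name used by the textinfo lang
    database):

        for path in find(paths, includes=[("lang", "Python")]):
            print path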
    """
    if not includes and not excludes: # Quick case.
        for path in paths:
            yield path
    else:
        all_fields = set(k for k, v in (includes or []))
        all_fields.update(k for k, v in (excludes or []))
        quick_determine_lang = False
        if all_fields == set(["lang"]):  # Special quick case.
            quick_determine_lang = True

        lidb = textinfo.get_default_lidb()
        for path in paths:
            ti = textinfo.TextInfo.init_from_path(path, lidb=lidb,
                    follow_symlinks=True,
                    quick_determine_lang=quick_determine_lang,
                    env=env)

            if includes:
                unmatched_includes = [field for field, value in includes
                                      if getattr(ti, field, None) != value]
                if unmatched_includes:
                    #log.debug("excluding path not matching '%s' include(s): %r",
                    #          "', '".join(unmatched_includes), path)
                    continue
            if excludes:
                matched_excludes = [field for field, value in excludes
                                    if getattr(ti, field, None) == value]
                if matched_excludes:
                    #log.debug("excluding path matching '%s' exclude(s): %r",
                    #          "', '".join(matched_excludes), path)
                    continue
            
            yield path



def grep(regex, paths, files_with_matches=False,
         treat_binary_files_as_text=False,
         skip_unknown_lang_paths=False,
         skip_filesizes_larger_than=None,
         first_on_line=False,
         includes=None, excludes=None,
         textInfoFactory=None,
         env=None):
    """Grep for `regex` in the given paths.

    @param regex {regex} is the regular expression to search for.
    @param paths {sequence|iterator} is a sequence of paths to search.
    @param files_with_matches {boolean} if true indicates that only
        a single hit for each file with one or more matches should be
        returned. Default is false.
    @param treat_binary_files_as_text {boolean} indicates that binary files
        should be searched. By default they are skipped.
    @param skip_unknown_lang_paths {boolean} can be set True to skip
        grepping in files for which the lang could not be determined.
        This is useful when doing *replacements* to be safe about not
        wreaking havoc in binary files that happen to look like plain
        text.
    @param skip_filesizes_larger_than {int} in bytes, if set this is used
        to skip files that are larger in size than this value.
    @param first_on_line {boolean} indicates, if True, that only the first
        hit on a line should be returned. Default is False. (This supports
        Vi-style replace without the 'g' flag.)
    @param includes {list} is a sequence of 2-tuples defining include
        filters on textinfo data: (<textinfo-field>, <value>).
    @param excludes {list} is a sequence of 2-tuples defining exclude
        filters on textinfo data: (<textinfo-field>, <value>).
    @param textInfoFactory {object}
        either a standard textinfo.TextInfo object, or a
        duck-type-equivalent object based on a loaded, modified document.
    @param env {runtime environment}
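
    Example usage (a sketch; `foo.py` is an illustrative path):

        for event in grep(re.compile('TODO'), ['foo.py']):
            if isinstance(event, FindHit):
                line, _ = event.line_num_range
                print "%s:%d: %s" % (event.path, line + 1,
                                     event.match.group(0))
            # Other events are SkipPath subclasses explaining skipped paths.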
    """
    if log.isEnabledFor(logging.DEBUG):
        log.debug("grep %r", str_from_regex_info(regex))

    for path in paths:
        path = normpath(path)
        if skip_filesizes_larger_than:
            try:
                statinfo = os.stat(path)
                if statinfo.st_size > skip_filesizes_larger_than:
                    yield SkipLargeFilePath(path, statinfo.st_size)
                    continue
            except EnvironmentError, ex:
                yield SkipPath(path, "error determining file info: %s" % ex)
                continue
        try:
            if textInfoFactory:
                ti = textInfoFactory.init_from_path(path, follow_symlinks=True, env=env)
            else:
                ti = textinfo.TextInfo.init_from_path(path,
                        follow_symlinks=True, env=env)
                ti.is_loaded_path = False
        except EnvironmentError, ex:
            yield SkipPath(path, "error determining file info: %s" % ex)
            continue

        if skip_unknown_lang_paths and ti.lang is None:
            yield SkipUnknownLangPath(path)
            continue
        if includes:
            unmatched_includes = [field for field, value in includes
                                  if getattr(ti, field, None) != value]
            if unmatched_includes:
                #log.debug("excluding path not matching '%s' include(s): %r",
                #          "', '".join(unmatched_includes), path)
                continue
        if excludes:
            matched_excludes = [field for field, value in excludes
                                if getattr(ti, field, None) == value]
            if matched_excludes:
                #log.debug("excluding path matching '%s' exclude(s): %r",
                #          "', '".join(matched_excludes), path)
                continue

        if ti.is_text:
            if ti.text is None:
                yield SkipPath(path, "couldn't decode '%s' text" % ti.encoding)
                continue
            text = ti.text
        elif treat_binary_files_as_text:
            f = open(path, 'rb')
            text = f.read()
            f.close()
        else:
            yield SkipBinaryPath(path)
            continue

        accessor = _TextAccessor(text)
        have_hits_in_path = False
        last_hit_line = None
        for match in find_all_matches(regex, text, start=0, end=None):
            if files_with_matches:
                have_hits_in_path = True
                yield PathHit(path)
                break
            hit = FindHit(path, ti, match, accessor)
            if first_on_line:
                line, _ = hit.line_num_range
                if line == last_hit_line:
                    continue
                last_hit_line = line
            yield hit
            have_hits_in_path = True
        if not have_hits_in_path:
            yield SkipNoHitsInPath(path)



def replace(regex, repl, paths,
            skip_filesizes_larger_than=None,
            first_on_line=False,
            includes=None, excludes=None,
            do_smart_replace=False,
            textInfoFactory=None,
            summary=None, env=None):
    """Make the given regex replacement in the given paths.

    This generates a stream of `Event`s. The main such event is
    `ReplaceHitGroup` which is a grouping of all replacements for a
    single path. **Note:** To actually make the change you must call
    `event.commit()` on these hits. This allows one to manage
    confirmation of replacements.
    
    Example usage:

        journal = None
        try:
            for event in replace(re.compile('foo'), 'bar', ['blah.txt']):
                if isinstance(event, StartJournal):
                    journal = event.journal
                elif not isinstance(event, ReplaceHitGroup):
                    # "SkipPath" events are sent for files with no hits, etc.
                    log.debug(event)
                else:
                    #... possibly confirm replacement with user
                    event.commit()
        finally:
            if journal:
                journal.close()

    @param regex {regular expression} is the regex with which to search
    @param repl {string} is the replacement string
    @param paths {sequence|iterator} is the sequence of paths to process
    @param skip_filesizes_larger_than {int} in bytes, if set this is used
        to skip files that are larger in size than this value.
    @param first_on_line {boolean} indicates, if True, that only the first
        hit on a line should be replaced. Default is False. (This supports
        Vi-style replace without the 'g' flag.)
    @param includes {list} is a sequence of 2-tuples defining include
        filters on textinfo data: (<textinfo-field>, <value>).
    @param excludes {list} is a sequence of 2-tuples defining exclude
        filters on textinfo data: (<textinfo-field>, <value>).
    @param textInfoFactory {object}
        returns either a standard textinfo.TextInfo object, or returns a
        duck-type-equivalent object based on a loaded, modified document.
    @param do_smart_replace {boolean}
        preserve the case of the matched text: an initial-capital or
        all-caps match yields a correspondingly cased replacement
    @param summary {str} an optional summary string for the replacement
        journal.
    @param env {runtime environment}

    """
    journal = None
    grepper = grep(regex, paths, skip_unknown_lang_paths=True,
                   skip_filesizes_larger_than=skip_filesizes_larger_than,
                   first_on_line=first_on_line,
                   includes=includes, excludes=excludes,
                   textInfoFactory=textInfoFactory,
                   env=env)
    for fhits in grouped_by_path(grepper):
        if not isinstance(fhits[0], Hit):
            yield fhits[0]
            continue
        # 'fhits' is a list of FindHits for a single path.

        path = fhits[0].path
        encoding = fhits[0].encoding_with_bom
        
        # Calculate the change.
        before_text = fhits[0].accessor.text
        if first_on_line:
            # Need to manually calculate the change from each find hit.
            # Algorithm: Walk backwards through the find hits, appending
            #   unmodified-bit, replacement, unmodified-bit, ...
            # in reversed order, then reverse and join the pieces.
            # Perf notes:
            # (a) Building a list and joining once is much faster than
            #     joining a string many times.
            # (b) `lst.append(item)` is much faster than `lst.insert(0, item)`.
            bits = [] # list of tail-end bits of the file
            idx = len(before_text)
            for hit in reversed(fhits):
                bits.append(before_text[hit.end_pos:idx])
                expanded_repl = hit.match.expand(repl)
                if do_smart_replace:
                    bits.append(do_smart_conversion(expanded_repl, repl))
                else:
                    bits.append(expanded_repl)
                idx = hit.start_pos
            bits.append(before_text[:idx])
            after_text = ''.join(reversed(bits))
        else:
            if do_smart_replace:
                # The pattern was plain text, not a regex or glob, so we can
                # safely wrap it with parens. It might have \b wrappers,
                # but we can safely add capture-parens, so the split works
                fixed_regex = re.compile("(%s)" % regex.pattern, regex.flags)
                pieces = fixed_regex.split(before_text)
                final_pieces = []
                for before_part, matched_part in zip(pieces[0::2], pieces[1::2]):
                    final_pieces.append(before_part)
                    final_pieces.append(do_smart_conversion(matched_part, repl))
                final_pieces.append(pieces[-1])
                after_text = "".join(final_pieces)
            else:
                after_text = regex.sub(repl, before_text)
        if before_text == after_text:
            continue
            

        # If this is the first path, start the journal for this
        # replacement (for undo).
        if journal is None:
            if summary is None:
                summary = str_from_regex_info(regex, repl)
            journal = Journal.create(summary)
            yield StartJournal(journal)

        yield ReplaceHitGroup(regex, repl, 
                              path, encoding, before_text,
                              fhits, after_text, journal)


def undo_replace(journal_id, dry_run=False, loadedFileManager=None):
    """Undo the given replacement.

    For now the undo support takes the easy way out: it only works if none
    of the files in the replacement have been subsequently modified.

    Undo process:
    1. Get the journal (yields a LoadJournal event with the loaded
       journal).
    2. Ensure that none of the files to be reverted have been
       subsequently modified (via MD5 check).
    3. Undo each replacement (yields a JournalReplaceRecord event for
       each replacement).
    4. Sanity check that the undo works (can compare MD5 via journaled
       value).
    5. Remove the journal.

    TODO: Support undo on modified files if the subsequent modifications
          are not on the same lines as any of the replacements.
    TODO: (future) Support undo on subsequent modification, skipping only
          those lines where there are conflicts.
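
    Example usage (a sketch; `journal_id` is an id from `Journal.journals()`):

        for event in undo_replace(journal_id):
            if isinstance(event, LoadJournal):
                log.debug("loaded journal %s", event.journal.id)
            else:
                # One JournalReplaceRecord per reverted path.
                log.debug("reverted `%s'", event.path)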
    """
    journal = Journal.load(journal_id)
    log.debug("undo replace `%s'", journal.id)
    yield LoadJournal(journal)

    # 2. Ensure can complete the undo.
    changed_paths = []

    if loadedFileManager is None:
        disk_based_groups = journal
        dirty_buffer_groups = []
    else:
        disk_based_groups = []
        dirty_buffer_groups = []
        for group in journal:
            if isinstance(group, JournalReplaceLoadedBufferRecord):
                dirty_buffer_groups.append(group)
            elif isinstance(group, JournalReplaceRecord):
                disk_based_groups.append(group)
            else:
                raise FindError("Unknown kind of group")
            
    for group in dirty_buffer_groups:
        if loadedFileManager.bufferHasChanged(group.path,
                                              group.after_md5sum):
            changed_paths.append(group.path)
    for group in disk_based_groups:
        f = open(group.path, 'rb') #TODO: handle path being missing
        bytes = f.read()
        f.close()
        md5sum = md5(bytes).hexdigest()
        if md5sum != group.after_md5sum:
            changed_paths.append(group.path)
    if changed_paths:
        raise FindError("Cannot undo replacement because the following "
                        "files have been subsequently modified: '%s'" 
                        % "', '".join(changed_paths))

    # 3. Undo each replacement.
    #TODO: Bullet-proof backups to '~/.frep/backups/<id>/<hash>' and
    #      restoration on any failure.
    paths_failing_sanity_check = []
    for group in dirty_buffer_groups:
        loadedFileManager.undoChanges(group.path)
        yield group
            
    for group in disk_based_groups:
        log.debug("undo changes to `%s'", group.path)
        f = codecs.open(group.path, 'rb', group.encoding)
        text = f.read()
        f.close()
        
        # Algorithm: Walk backwards through the replacements, appending
        #   unmodified-bit, original, unmodified-bit, ...
        # in reversed order, then reverse and join the pieces.
        # Perf notes:
        # (a) Building a list and joining once is much faster than
        #     joining a string many times.
        # (b) `lst.append(item)` is much faster than `lst.insert(0, item)`.
        bits = [] # list of tail-end bits of the file
        idx = len(text)
        for hit in reversed(group.rhits):
            bits.append(text[hit.end_pos:idx])
            bits.append(hit.before)
            idx = hit.start_pos
        bits.append(text[:idx])
        text = ''.join(reversed(bits))

        # 4. Sanity check.
        bytes = text.encode(group.encoding)
        md5sum = md5(bytes).hexdigest()
        log.debug("undo md5 check: before=%s, after undo=%s",
                  group.before_md5sum, md5sum)
        if md5sum != group.before_md5sum:
            paths_failing_sanity_check.append(group.path)

        # Write out the reverted content.
        if not dry_run:
            f = open(group.path, 'wb')
            f.write(bytes)
            f.close()

        yield group

    # 4. Sanity check.
    #TODO: Something more useful than just warning. Yield event?
    if paths_failing_sanity_check:
        log.warn("sanity check failure: the following file(s) weren't "
                 "restored to their exact state before the replacement: '%s'",
                 "', '".join(paths_failing_sanity_check))

    # 5. Remove the journal.
    if not dry_run:
        try:
            os.remove(journal.path)
        except EnvironmentError, ex:
            log.warn("couldn't remove journal '%s': %s", journal.path, ex)
        if exists(journal.summary_path):
            try:
                os.remove(journal.summary_path)
            except EnvironmentError, ex:
                log.warn("couldn't remove journal summary '%s': %s", 
                         journal.summary_path, ex)


#---- public API functions that generally deal in re MatchObjects

def find_all_matches(regex, text, start=0, end=None):
    """Generate a regex `MatchObject` for each match of `regex` in the
    given `text`.
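
    For example (zero-length matches are stepped past so iteration
    terminates):

        >>> [m.span() for m in find_all_matches(re.compile('a'), 'banana')]
        [(1, 2), (3, 4), (5, 6)]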
    """
    if end is None:
        end = len(text)
    while True:
        match = regex.search(text, start, end)
        if match:
            yield match
            if match.start() == match.end():
                # Zero-length match: advance one char to avoid an infinite loop.
                start = match.end() + 1
                if start > end:
                    break
            else:
                start = match.end()
        else:
            break

def find_all_matches_bwd(regex, text, start=0, end=None):
    """Generate a regex `MatchObject` for each match of `regex` in the
    given `text` *backwards* from `end` to `start`.
    """
    matches = [m for m in find_all_matches(regex, text, start, end)]
    for match in reversed(matches):
        yield match


#---- event/hit class hierarchy
# In general the primary methods (grep(), find(), replace()) generate
# a stream of Event subclass instances. The general hierarchy is:
#
#   Event               # virtual base class
#       Hit             # base class for a "hit", i.e. found something
#           FindHit
#           ReplaceHit
#           ...
#       # Other non-hit events.
#       SkipPath
#       StartJournal

def grouped_by_path(events):
    """Group "Hit" events in the given find event stream by path.

    Non-Hits are each yielded in their own group. Hits are grouped by path.
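
    For example (a sketch; `regex` and `paths` are assumed inputs), grouping
    grep() results per file:

        for group in grouped_by_path(grep(regex, paths)):
            if isinstance(group[0], Hit):
                print "%s: %d hit(s)" % (group[0].path, len(group))
            else:
                print group[0]   # a single non-Hit event (e.g. a SkipPath)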
    """
    group_path = None
    group = []  # group of FindHits for a single path

    for event in events:
        if not isinstance(event, Hit):
            if group:
                yield group
                group_path = None
                group = []
            yield [event]
            continue

        if group_path is None:
            group_path = event.path
            group = []
        elif event.path != group_path:
            yield group
            group_path = event.path
            group = []
        group.append(event)

    if group:
        yield group



class Event(object):
    pass

class SkipPath(Event):
    def __init__(self, path, reason):
        self.path = path
        self.reason = reason
    def __repr__(self):
        SUMMARY_LEN = 40
        reason_summary = self.reason
        if len(reason_summary) > SUMMARY_LEN:
            reason_summary = reason_summary[:SUMMARY_LEN-3] + "..."
        return "<SkipPath %s (%s)>" % (self.path, reason_summary)
    def __str__(self):
        SUMMARY_LEN = 40
        reason_summary = self.reason
        if len(reason_summary) > SUMMARY_LEN:
            reason_summary = reason_summary[:SUMMARY_LEN-3] + "..."
        return "skipped `%s' (%s)" % (self.path, reason_summary)

class SkipBinaryPath(SkipPath):
    """Event yielded when skipping a binary path during grep() or replace()."""
    def __init__(self, path):
        SkipPath.__init__(self, path, "binary")

class SkipUnknownLangPath(SkipPath):
    """Event yielded when skipping a path because its lang could not be
    identified.
    """
    def __init__(self, path):
        SkipPath.__init__(self, path, "unknown language")

class SkipLargeFilePath(SkipPath):
    """Event yielded when skipping a path because its filesize is too large."""
    def __init__(self, path, size):
        SkipPath.__init__(self, path, "too large: %d bytes" % (size))
        self.size = size

class SkipNoHitsInPath(SkipPath):
    """Event yielded when skipping a path with no hits during grep()
    or replace().
    """
    def __init__(self, path):
        SkipPath.__init__(self, path, "no hits")

class ReplaceDiff(Event):
    """Event giving the diff for an impending replacement."""
    def __init__(self, path, diff):
        self.path = path
        self.diff = diff
    def __repr__(self):
        return "<ReplaceDiff %s>" % self.path

class StartJournal(Event):
    """Event signalling the creation of a replacement journal."""
    def __init__(self, journal):
        self.journal = journal
    def __repr__(self):
        return "<StartJournal %s>" % self.journal.id

class LoadJournal(Event):
    """First event emitted by undo_replace() after load of the journal."""
    def __init__(self, journal):
        self.journal = journal
    def __repr__(self):
        return "<LoadJournal %s>" % self.journal.id
    

class Hit(Event):
    path = None

class PathHit(Hit):
    def __init__(self, path):
        self.path = path

class _HitWithAccessorMixin(object):
    """Provide some utilities for getting text context information given
    a `self.accessor'.
    """
    __line_num_range_cache = None
    @property
    def line_num_range(self):
        if self.__line_num_range_cache is None:
            start = self.accessor.line_from_pos(self.start_pos)
            end   = self.accessor.line_from_pos(self.end_pos)
            self.__line_num_range_cache = (start, end)
        return self.__line_num_range_cache

    __line_and_col_num_range_cache = None
    @property
    def line_and_col_num_range(self):
        if self.__line_and_col_num_range_cache is None:
            start = self.accessor.line_and_col_from_pos(self.start_pos)
            end   = self.accessor.line_and_col_from_pos(self.end_pos)
            self.__line_and_col_num_range_cache = (start, end)
        return self.__line_and_col_num_range_cache

    __start_line_and_col_num_cache = None
    @property
    def start_line_and_col_num(self):
        if self.__start_line_and_col_num_cache is None:
            self.__start_line_and_col_num_cache \
                = self.accessor.line_and_col_from_pos(self.start_pos)
        return self.__start_line_and_col_num_cache

    @property
    def lines(self):
        start, end = self.line_num_range
        for line_num in range(start, end+1):
            start_pos = self.accessor.pos_from_line_and_col(line_num, 0)
            try:
                end_pos = self.accessor.pos_from_line_and_col(line_num+1, 0)
            except IndexError:
                end_pos = len(self.accessor.text)
            yield self.accessor.text_range(start_pos, end_pos)

    def lines_with_context(self, n):
        start, end = self.line_num_range
        context_start = max(0, start - n)
        context_end   = max(0, end + n)
        for line_num in range(context_start, context_end+1):
            type = (start <= line_num <= end and "hit" or "context")
            start_pos = self.accessor.pos_from_line_and_col(line_num, 0)
            try:
                end_pos = self.accessor.pos_from_line_and_col(line_num+1, 0)
            except IndexError:
                end_pos = len(self.accessor.text)
            yield type, self.accessor.text_range(start_pos, end_pos)


class FindHit(Hit, _HitWithAccessorMixin):
    def __init__(self, path, textInfo, match, accessor):
        self.path = path
        self.textInfo = textInfo
        self.match = match
        self.accessor = accessor

    def __repr__(self):
        hit_summary = repr(self.match.group(0))
        if len(hit_summary) > 20:
            hit_summary = hit_summary[:16] + "...'"
        return "<FindHit %s at %s#%s>"\
               % (hit_summary, self.path, self.line_num_range[0]+1)

    @property
    def encoding(self):
        # bug 93985 -- this method currently not used, but available for
        # any future use. Arguably we should drop this, and just
        # return encoding_with_bom
        return self.textInfo.encoding

    @property
    def encoding_with_bom(self):
        _encoding = self.textInfo.encoding
        # bug 93985 -- make sure we preserve the BOM for utf-8 files with BOM
        if _encoding == "utf-8" and self.textInfo.has_bom:
            return "utf-8-sig"
        else:
            return _encoding

    @property
    def is_loaded_path(self):
        return self.textInfo.is_loaded_path

    @property
    def start_pos(self):
        return self.match.start()
    @property
    def end_pos(self):
        return self.match.end()


class ReplaceHit(Hit, _HitWithAccessorMixin):
    def __init__(self, path, before, after, start_pos, end_pos, start_line,
                 find_hit=None, accessor=None):
        self.path = path
        self.before = before
        self.after = after
        self.start_pos = start_pos
        self.end_pos = end_pos
        self.start_line = start_line

        # These will be None for ReplaceHits restored from a journal.
        self.find_hit = find_hit
        self.accessor = accessor

    def cull_mem(self):
        """Can be called when done with the line/col calculation-related
        methods on this object. It releases some of the larger memory
        held by this object.

        This can be useful to keep memory usage low for a large replace
        operation over many files.
        """
        del self.find_hit
        del self.accessor
    
    def __getstate__(self):
        """Don't pickle some attrs."""
        d = self.__dict__.copy()
        try:
            del d["find_hit"]
        except KeyError:
            pass
        try:
            del d["accessor"]
        except KeyError:
            pass
        return d

    def __repr__(self):
        before_summary = repr(self.before)
        if len(before_summary) > 20:
            before_summary = before_summary[:16] + "...'"
        after_summary = repr(self.after)
        if len(after_summary) > 20:
            after_summary = after_summary[:16] + "...'"
        return "<ReplaceHit %s -> %s at %s#%s>" \
               % (before_summary, after_summary, self.path, self.start_line+1)

class ChangeHit(object):
    def __init__(self, findHit, replaceHit):
        self.start_pos = findHit.start_pos
        self.end_pos = findHit.end_pos
        self.start_line = replaceHit.start_line
        self.before = replaceHit.before
        self.after = replaceHit.after

class ReplaceHitGroup(Hit):
    """A group of ReplaceHits for a single path. This class knows how
    to do the actual replacements on disk and to log to the replacement
    journal.
    """
    def __init__(self, regex, repl, path, encoding, before_text,
                 fhits, after_text, journal):
        self.regex = regex
        self.repl = repl
        self.path = abspath(path)           # Need abspath for journal for undo.
        self.encoding = encoding
        self.before_text = before_text
        self.fhits = fhits
        self.after_text = after_text
        self.journal = journal

        #self._calculated_rhits = False  # whether lazy calculations have been done

    def __repr__(self):
        return "<ReplaceHitGroup %s (%d replacements)>" \
               % (self.nicepath, self.length)

    @property
    def nicepath(self):
        r = _relpath(self.path)
        a = self.path
        if not sys.platform == "win32":
            home = os.environ["HOME"]
            if a.startswith(home + os.sep):
                a = "~" + a[len(home):]
        if len(r) < len(a):
            return r
        else:
            return a

    @property
    def length(self):
        return len(self.fhits)

    _diff_cache = None
    @property
    def diff(self):
        if self._diff_cache is None:
            diff_lines = ["Index: %s\n" % self.path]
            diff_lines += difflibex.unified_diff(
                    self.before_text.splitlines(1),
                    self.after_text.splitlines(1),
                    "%s (before)" % self.path,
                    "%s (after)" % self.path)
            self._diff_cache = ''.join(diff_lines)
        return self._diff_cache

    def commit(self, loadedFileManager=None):
        """Log changes to the journal and make the replacements."""
        # Gather and log (to the journal) the data needed for undo:
        # - The file encoding (encoding).
        # - The md5sum of the file before replacements
        #   (before_md5sum), allows for a sanity check if there
        #   weren't any subsequent changes.
        # - The md5sum of the file after replacements
        #   (after_md5sum), so we can know if we might have
        #   difficulties undoing later because of subsequent
        #   changes.
        # - For each hit: the string replaced (before), the
        #   replacement string (after), the char pos range
        #   (start_pos, end_pos), the start line (start_line,
        #   0-based).
        #TODO: get before_md5sum from the TextInfo
        before_md5sum = md5(
            self.before_text.encode(self.encoding)).hexdigest()
        after_md5sum = md5(
            self.after_text.encode(self.encoding)).hexdigest()

        self.rhits = []
        accessor = _TextAccessor(self.after_text)
        pos_offset = 0
        line_offset = 0
        for fhit in self.fhits:
            m = fhit.match
            before = m.group(0)
            #TODO: Need to, at least, do EOL-normalization on 'after'.
            after = m.expand(self.repl)
            start_pos = fhit.start_pos + pos_offset
            pos_offset += len(after) - len(before)
            end_pos = fhit.end_pos + pos_offset
            start_line = fhit.line_num_range[0] + line_offset
            line_offset += after.count('\n') - before.count('\n')
            rhit = ReplaceHit(self.path, before, after,
                              start_pos, end_pos, start_line,
                              fhit, accessor)
            self.rhits.append(rhit)

        # Apply the changes.
        #TODO: catch any exception and restore file (or
        #      create a backup somewhere) if write fails
        if not self.fhits[0].is_loaded_path:
            self.journal.add_replace_group(self.path, self.encoding,
                                           before_md5sum, after_md5sum,
                                           self.rhits)
            f = codecs.open(self.path, 'wb', self.encoding)
            try:
                f.write(self.after_text)
            finally:
                f.close()
        else:
            if not loadedFileManager:
                raise FindError("Can't apply changes to loaded dirty file %s"
                                % self.path)
            changeHits = []
            for fhit, rhit in zip(self.fhits, self.rhits):
                changeHit = ChangeHit(fhit, rhit)
                changeHits.append(changeHit)
            self.journal.add_replace_loaded_file_group(self.path,
                                                       self.encoding,
                                                       before_md5sum, after_md5sum,
                                                       changeHits)
            loadedFileManager.applyChanges(self.path, self.after_text, changeHits)

    #TODO: Figure out how this works with the other data bits.
    #      E.g., do we calc the diff before caching?
    #def cache(self):
    #    """Can be called to have memory-heavy state be stored to a cache
    #    area and dropped.
    #    """



#---- replace journaling stuff

class JournalReplaceRecord(Event):
    """A record of the replacements to a single path in a Journal.
    
    These are pickled in a Journal and are used by `undo_replace()`.
    """
    def __init__(self, path, encoding, before_md5sum, after_md5sum,
                 rhits):
        self.path = abspath(path)           # Need abspath for journal for undo.
        self.encoding = encoding
        self.before_md5sum = before_md5sum  # MD5 of path before replacements
        self.after_md5sum = after_md5sum    # MD5 of path after replacements
        self.rhits = rhits

    def __repr__(self):
        return "<JournalReplaceRecord %s (%d replacements)>" \
               % (self.nicepath, len(self.rhits))

    @property
    def length(self):
        return len(self.rhits)

    @property
    def nicepath(self):
        r = _relpath(self.path)
        a = self.path
        if not sys.platform == "win32":
            home = os.environ["HOME"]
            if a.startswith(home + os.sep):
                a = "~" + a[len(home):]
        if len(r) < len(a):
            return r
        else:
            return a

class JournalReplaceLoadedBufferRecord(JournalReplaceRecord):
    pass

class Journal(list):
    """A replacement journal.
    
    >> j = Journal.create("s/foo/bar/ in *.py")  # create a new journal 
    >> j.add_replace_group(...) # add replacements for a path
    >> j.close()                # save (as a pickle) and close the file

    The created journal file is a pickle file in the journal dir
    (as returned by the _get_journal_dir() static method). The path is
    stored in `j.path`.

    >> k = Journal.load(j.path)    # open and load the saved journal
    >> k[0]
    ...hits for 'path'...

    There is also a journal id (it is part of the file path) that can be
    used to load it.

    >> m = Journal.load(j.id)
    >> m[0]
    ...hits for 'path'...
    """
    id = None
    path = None
    summary_path = None
    summary = None   # prose description of the replacement

    _journal_dir = None
    @classmethod
    def set_journal_dir(cls, dir):
        """Call this to configure where the Journal class stores
        journals.
        """
        cls._journal_dir = dir

    @classmethod
    def get_journal_dir(cls):
        if cls._journal_dir is None:
            try:
                import applib
            except ImportError:
                d = expanduser("~/.findlib2")
            else:
                d = applib.user_cache_dir("findlib2", "ActiveState")
            cls._journal_dir = d
        return cls._journal_dir

    @staticmethod
    def get_new_id(journal_dir):
        existing_ids = set(p[8:-7] for p in os.listdir(journal_dir)
            if p.startswith("journal-") and p.endswith(".pickle"))

        SENTINEL = 100
        for i in range(SENTINEL):
            id = _get_friendly_id()
            if id not in existing_ids:
                return id
        else:
            raise FindError("could not find a unique journal id in "
                            "%d attempts" % SENTINEL)

    NUM_JOURNALS_TO_KEEP = 5
    @classmethod
    def remove_old_journals(cls, journal_dir):
        """By default, creating a replacement journal removes all but the
        most recent {NUM_JOURNALS_TO_KEEP} journals; otherwise disk usage
        would grow without bound over long usage.
        """
        try:
            prefix_len = len(join(journal_dir, "journal-"))
            suffix_len = len(".pickle")
            mtimes_and_journal_ids = [
                (os.stat(p).st_mtime, p[prefix_len:-suffix_len])
                for p in glob(join(journal_dir, "journal-*.pickle"))
            ]
            mtimes_and_journal_ids.sort()
            for mtime, id in mtimes_and_journal_ids[:-cls.NUM_JOURNALS_TO_KEEP]:
                jpath = join(journal_dir, "journal-%s.pickle" % (id,))
                log.debug("rm old journal %s", jpath)
                try:
                    os.remove(jpath)
                except EnvironmentError:
                    log.error("Can't remove journal %s", jpath)
                spath = join(journal_dir, "summary-%s.txt" % (id,))
                log.debug("rm old summary %s", spath)
                try:
                    os.remove(spath)
                except EnvironmentError:
                    log.error("Can't remove summary %s", spath)
        except EnvironmentError, ex:
            log.warn("error removing old journals: %s (ignored)", ex)

    @classmethod
    def journals(cls):
        """Generate the list of current journals.

        Yields tuples of the form (most recent first):
            (<mtime>, <id>, <summary>)
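
        For example (a sketch):

            for mtime, id, summary in Journal.journals():
                print time.ctime(mtime), id, summary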
        """
        journal_dir = cls.get_journal_dir()
        mtimes_and_paths = [
            (os.stat(p).st_mtime, p)
            for p in glob(join(journal_dir, "journal-*.pickle"))
        ]
        mtimes_and_paths.sort(reverse=True)

        for mtime, path in mtimes_and_paths:
            id = basename(path)[8:-7]  # strip "journal-" (8 chars) and ".pickle" (7 chars)
            summary_path = join(dirname(path), "summary-%s.txt" % id)
            try:
                f = codecs.open(summary_path, 'r', "utf-8")
                summary = f.read()
                f.close()
            except EnvironmentError:
                summary = "(no summary)"
            yield mtime, id, summary

    @classmethod
    def create(cls, summary):
        journal_dir = cls.get_journal_dir()
        if not exists(journal_dir):
            os.makedirs(journal_dir)
        else:
            cls.remove_old_journals(journal_dir)

        id = cls.get_new_id(journal_dir)
        path = join(journal_dir, "journal-%s.pickle" % id)

        summary_path = join(journal_dir, "summary-%s.txt" % id)
        f = codecs.open(summary_path, 'w', "utf-8")
        try:
            f.write(summary)
        finally:
            f.close()

        file = open(path, 'wb')
        instance = cls(id, path, file)
        instance.summary_path = summary_path
        instance.summary = summary
        return instance

    @classmethod
    def load(cls, path_or_id):
        if os.sep in path_or_id or (os.altsep and os.altsep in path_or_id):
            path = path_or_id
            id = basename(path_or_id)[8:-7]
        else:
            id = path_or_id
            journal_dir = cls.get_journal_dir()
            path = join(journal_dir, "journal-%s.pickle" % id)

        summary_path = join(dirname(path), "summary-%s.txt" % id)
        f = codecs.open(summary_path, 'r', "utf-8")
        try:
            summary = f.read()
        finally:
            f.close()

        file = open(path, 'rb')
        instance = cls(id, path, file)
        instance.summary_path = summary_path
        instance.summary = summary
        try:
            instance += pickle.load(file)
        finally:
            file.close()
        return instance

    def __init__(self, id, path, file):
        list.__init__(self)
        self.id = id
        self.path = path
        self.file = file

    def __del__(self):
        self.close()

    def add_replace_group(self, path, encoding, before_md5sum, after_md5sum, 
                          rhits):
        #TODO: Do I need eol-style here as well? Yes, I think so. Add a
        #      test case for this!
        record = JournalReplaceRecord(path, encoding, before_md5sum,
                                      after_md5sum, rhits)
        self.append(record)

    def add_replace_loaded_file_group(self, path, encoding,
                                      before_md5sum, after_md5sum, 
                                      changeHits):
        #TODO: Do I need eol-style here as well? Yes, I think so. Add a
        #      test case for this!
        record = JournalReplaceLoadedBufferRecord(path, encoding, before_md5sum,
                                                  after_md5sum, changeHits)
        self.append(record)

    def close(self):
        if not self.file.closed:
            if len(self):
                # Dump data to pickle.
                pickle.dump(list(self), self.file)
                self.file.close()
            else:
                self.file.close()
                _rm(self.path)
                _rm(self.summary_path)


#---- more generic utils

def _rm(path, log=None):
    """Forcefully and silently remove the given file, if possible."""
    if log:
        log("rm `%s'", path)
    if not exists(path):
        return
    try:
        os.remove(path)
    except EnvironmentError:
        pass

# Recipe: regex_from_str (2.2.1)
def str_from_regex_info(regex, repl=None):
    r"""Generate a string representing the regex (and optional replacement).

        >>> str_from_regex_info(re.compile('foo'))
        '/foo/'
        >>> str_from_regex_info(re.compile('foo', re.I))
        '/foo/i'
        >>> str_from_regex_info(re.compile('foo'), 'bar')
        's/foo/bar/'
        >>> str_from_regex_info(re.compile('foo/bar', re.M), 'bar')
        's/foo\\/bar/bar/m'

    Note: this is intended to round-trip with regex_info_from_str().
    """
    ch_from_flag = {
        re.IGNORECASE: "i",
        re.LOCALE: "l",
        re.DOTALL: "s",
        re.MULTILINE: "m",
        re.UNICODE: "u",
    }
    def _str_from_flags(flags):
        if not flags:
            return ""
        chars = []
        for flag, ch in ch_from_flag.items():
            if flags & flag:
                chars.append(ch)
        chars.sort()
        return ''.join(chars)
    def _escape(s):
        return s.replace('/', r'\/')

    pattern = _escape(regex.pattern)
    flags_str = _str_from_flags(regex.flags)
    if repl is None:
        s = "/%s/%s" % (pattern, flags_str)
    else:
        s = "s/%s/%s/%s" % (pattern, _escape(repl), flags_str)
    return s

def regex_info_from_str(s, allow_replace=True, word_match=False,
                         universal_newlines=False):
    r"""Interpret a regex match or substitution string.

        >>> regex_info_from_str("foo") \
        ...   == (re.compile('foo'), None)
        True
        >>> regex_info_from_str("/foo/") \
        ...   == (re.compile('foo'), None)
        True
        >>> regex_info_from_str("/foo/i") \
        ...   == (re.compile('foo', re.I), None)
        True
        >>> regex_info_from_str("s/foo/bar/i") \
        ...   == (re.compile('foo', re.I), "bar")
        True

    The `word_match` boolean modifies the pattern to match at word
    boundaries.

        >>> regex_info_from_str("/foo/", word_match=True) \
        ...   == (re.compile(r'(?<!\w)foo(?!\w)'), None)
        True

    The `universal_newlines` boolean modifies the pattern such that the
    '$' anchor will match at '\r\n' and '\r'-style EOLs.
    
        >>> regex_info_from_str("/foo$/", universal_newlines=True) \
        ...   == (re.compile(r'foo(?=\r\n|(?<!\r)\n|\r(?!\n)|\Z)'), None)
        True
        >>> regex_info_from_str(r"/foo\$/", universal_newlines=True) \
        ...   == (re.compile(r'foo\$'), None)
        True
        >>> regex_info_from_str(r"/foo[$]/", universal_newlines=True) \
        ...   == (re.compile(r'foo[$]'), None)
        True

    Note: this is intended to round-trip with str_from_regex_info().
    """
    flag_from_ch = {
        "i": re.IGNORECASE,
        "l": re.LOCALE,
        "s": re.DOTALL,
        "m": re.MULTILINE,
        "u": re.UNICODE,
    }
    def _flags_from_str(flags_str):
        flags = 0
        for ch in flags_str:
            try:
                flags |= flag_from_ch[ch]
            except KeyError:
                raise ValueError("unsupported regex flag: '%s' in '%s' "
                                 "(must be one of '%s')"
                                 % (ch, flags_str, ''.join(flag_from_ch.keys())))
        return flags

    plain = False
    if s.startswith('/') and s.rfind('/') != 0:
        # Parse it: /PATTERN/FLAGS
        idx = s.rfind('/')
        pattern, flags_str = s[1:idx], s[idx+1:]
        flags = _flags_from_str(flags_str)
        repl = None
    elif allow_replace and s.startswith("s/") and s.count('/') >= 3:
        # Parse it: s/PATTERN/REPL/FLAGS
        #TODO: test with '\' in pattern and repl
        repl_re = re.compile(r"^s/(.*?)(?<!\\)/(.*?)(?<!\\)/([ilsmu]*)$")
        m = repl_re.match(s)
        if not m:
            raise ValueError("invalid replacement syntax: %r" % s)
        pattern, repl, flags_str = m.groups()
        # Note: word_match wrapping is applied below (in the `if not plain`
        # block), so it is not applied here as well.
        flags = _flags_from_str(flags_str)
    else: # not an encoded regex
        plain = True
        pattern = re.escape(s)
        if word_match:
            pattern = r"(?<!\w)" + pattern + r"(?!\w)"
        flags = 0
        repl = None
    
    if not plain:
        if word_match:
            # Komodo Bug 33698: "Match whole word" doesn't work as
            # expected. Before this change the transformation was "\bPATTERN\b"
            # where \b means:
            #   matches a boundary between a word char and a non-word char
            # However what is really wanted (and what VS.NET does) is to
            # match if there is NOT a word character to either immediate
            # side of the pattern.
            pattern = r"(?<!\w)" + pattern + r"(?!\w)"
        if universal_newlines and '$' in pattern:
            # Modifies the pattern such that the '$' anchor will match
            # at '\r\n' and '\r'-style EOLs. To do this we replace
            # occurrences '$' with a pattern that properly matches all
            # EOL styles, being careful to skip escaped dollar signs.
            chs = []
            STATE_DEFAULT, STATE_ESCAPE, STATE_CHARCLASS = range(3)
            state = STATE_DEFAULT
            for ch in pattern:
                chs.append(ch)
                if state == STATE_DEFAULT:
                    if ch == '\\':
                        state = STATE_ESCAPE
                    elif ch == '$':
                        chs[-1] = r"(?=\r\n|(?<!\r)\n|\r(?!\n)|\Z)"
                    elif ch == '[':
                        state = STATE_CHARCLASS
                elif state == STATE_ESCAPE:
                    state = STATE_DEFAULT
                elif state == STATE_CHARCLASS:
                    if ch == ']':
                        state = STATE_DEFAULT
            pattern = ''.join(chs)
    
    return (re.compile(pattern, flags), repl)


# Recipe: paths_from_path_patterns (0.5)
def _should_include_path(path, includes, excludes):
    """Return True iff the given path should be included."""
    from os.path import basename
    from fnmatch import fnmatch

    base = basename(path)
    if includes:
        for include in includes:
            if fnmatch(base, include):
                try:
                    log.debug("include `%s' (matches `%s')", path, include)
                except (NameError, AttributeError):
                    pass
                break
        else:
            try:
                log.debug("exclude `%s' (matches no includes)", path)
            except (NameError, AttributeError):
                pass
            return False
    for exclude in excludes:
        if fnmatch(base, exclude):
            try:
                log.debug("exclude `%s' (matches `%s')", path, exclude)
            except (NameError, AttributeError):
                pass
            return False
    return True

def _walk(top, topdown=True, onerror=None, follow_symlinks=False):
    """A version of `os.walk()` with a couple differences regarding symlinks.
    
    1. follow_symlinks=False (the default): A symlink to a dir is
       returned as a *non*-dir. In `os.walk()`, a symlink to a dir is
       returned in the *dirs* list, but it is not recursed into.
    2. follow_symlinks=True: A symlink to a dir is returned in the
       *dirs* list (as with `os.walk()`) but it *is conditionally*
       recursed into (unlike `os.walk()`).

    TODO: put as a separate recipe
    """
    from os.path import join, isdir, islink, abspath

    # We may not have read permission for top, in which case we can't
    # get a list of the files the directory contains.  os.path.walk
    # always suppressed the exception then, rather than blow up for a
    # minor reason when (say) a thousand readable directories are still
    # left to visit.  That logic is copied here.
    try:
        names = os.listdir(top)
    except OSError, err:
        if onerror is not None:
            onerror(err)
        return

    dirs, nondirs = [], []
    if follow_symlinks:
        for name in names:
            if isdir(join(top, name)):
                dirs.append(name)
            else:
                nondirs.append(name)
    else:
        for name in names:
            path = join(top, name)
            if islink(path):
                nondirs.append(name)
            elif isdir(path):
                dirs.append(name)
            else:
                nondirs.append(name)

    if topdown:
        yield top, dirs, nondirs
    for name in dirs:
        path = join(top, name)
        for x in os_walk(path, topdown, onerror, followlinks=follow_symlinks):
            yield x
    if not topdown:
        yield top, dirs, nondirs

_NOT_SPECIFIED = ("NOT", "SPECIFIED")
def paths_from_path_patterns(path_patterns, files=True, dirs="never",
                              recursive=True, includes=[], excludes=[],
                              skip_dupe_dirs=False,
                              follow_symlinks=False,
                              yield_structure=False,
                              on_error=_NOT_SPECIFIED):
    """paths_from_path_patterns([<path-patterns>, ...]) -> file paths

    Generate a list of paths (files and/or dirs) represented by the given path
    patterns.

        "path_patterns" is a list of paths optionally using the '*', '?' and
            '[seq]' glob patterns.
        "files" is boolean (default True) indicating if file paths
            should be yielded
        "dirs" is string indicating under what conditions dirs are
            yielded. It must be one of:
              never             (default) never yield dirs
              always            yield all dirs matching given patterns
              if-not-recursive  only yield dirs for invocations when
                                recursive=False
            See use cases below for more details.
        "recursive" is boolean (default True) indicating if paths should
            be recursively yielded under given dirs.
        "includes" is a list of file patterns to include in recursive
            searches.
        "excludes" is a list of file and dir patterns to exclude.
            (Note: This is slightly different than GNU grep's --exclude
            option which only excludes *files*.  I.e. you cannot exclude
            a ".svn" dir.)
        "skip_dupe_dirs" can be set True to watch for and skip
            descending into a dir that has already been yielded.
        "follow_symlinks" is a boolean indicating whether to follow
            symlinks (default False). Use "skip_dupe_dirs" to guard
            against infinite loops with circular dir symlinks.
        "yield_structure" whether to yield entries the os.walk way (rather than
            one path at a time).
        "on_error" is an error callback called when a given path pattern
            matches nothing:
                on_error(PATH_PATTERN)
            If not specified, the default is look for a "log" global and
            call:
                log.error("`%s': No such file or directory")
            Specify None to do nothing.

    Typically this is useful for a command-line tool that takes a list
    of paths as arguments. (For Unix-heads: the shell on Windows does
    NOT expand glob chars, that is left to the app.)

    Use case #1: like `grep -r`
      {files=True, dirs='never', recursive=(if '-r' in opts)}
        script FILE     # yield FILE, else call on_error(FILE)
        script DIR      # yield nothing
        script PATH*    # yield all files matching PATH*; if none,
                        # call on_error(PATH*) callback
        script -r DIR   # yield files (not dirs) recursively under DIR
        script -r PATH* # yield files matching PATH* and files recursively
                        # under dirs matching PATH*; if none, call
                        # on_error(PATH*) callback

    Use case #2: like `file -r` (if it had a recursive option)
      {files=True, dirs='if-not-recursive', recursive=(if '-r' in opts)}
        script FILE     # yield FILE, else call on_error(FILE)
        script DIR      # yield DIR, else call on_error(DIR)
        script PATH*    # yield all files and dirs matching PATH*; if none,
                        # call on_error(PATH*) callback
        script -r DIR   # yield files (not dirs) recursively under DIR
        script -r PATH* # yield files matching PATH* and files recursively
                        # under dirs matching PATH*; if none, call
                        # on_error(PATH*) callback

    Use case #3: kind of like `find .`
      {files=True, dirs='always', recursive=(if '-r' in opts)}
        script FILE     # yield FILE, else call on_error(FILE)
        script DIR      # yield DIR, else call on_error(DIR)
        script PATH*    # yield all files and dirs matching PATH*; if none,
                        # call on_error(PATH*) callback
        script -r DIR   # yield files and dirs recursively under DIR
                        # (including DIR)
        script -r PATH* # yield files and dirs matching PATH* and recursively
                        # under dirs; if none, call on_error(PATH*)
                        # callback

    TODO: perf improvements (profile, stat just once)
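
    Example usage (a sketch; the "src" dir and the patterns are illustrative):

        for path in paths_from_path_patterns(["src"], recursive=True,
                                             includes=["*.py"],
                                             excludes=[".svn", "*.pyc"]):
            print path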
    """

    assert not isinstance(path_patterns, basestring), \
        "'path_patterns' must be a sequence, not a string: %r" % path_patterns
    GLOB_CHARS = '*?['

    if skip_dupe_dirs:
        searched_dirs = set()

    for path_pattern in path_patterns:
        # Determine the set of paths matching this path_pattern.
        for glob_char in GLOB_CHARS:
            if glob_char in path_pattern:
                paths = glob(path_pattern)
                break
        else:
            if follow_symlinks:
                paths = [path_pattern] if exists(path_pattern) else []
            else:
                paths = [path_pattern] if lexists(path_pattern) else []
        if not paths:
            if on_error is None:
                pass
            elif on_error is _NOT_SPECIFIED:
                try:
                    log.error("`%s': No such file or directory", path_pattern)
                except (NameError, AttributeError):
                    pass
            else:
                on_error(path_pattern)

        for path in paths:
            if (follow_symlinks or not islink(path)) and isdir(path):
                if skip_dupe_dirs:
                    canon_path = normpath(abspath(path))
                    if follow_symlinks:
                        canon_path = realpath(canon_path)
                    if canon_path in searched_dirs:
                        continue
                    else:
                        searched_dirs.add(canon_path)

                # 'includes' SHOULD affect whether a dir is yielded.
                if (dirs == "always"
                    or (dirs == "if-not-recursive" and not recursive)
                   ) and _should_include_path(path, includes, excludes):
                    if not yield_structure:
                        yield path

                # However, if recursive, 'includes' should NOT affect
                # whether a dir is recursed into. Otherwise you could
                # not:
                #   script -r --include="*.py" DIR
                if recursive and _should_include_path(path, [], excludes):
                    for dirpath, dirnames, filenames in os_walk(path,
                            followlinks=follow_symlinks):
                        _filenames = []
                        _dirnames = []
                        dir_indices_to_remove = []
                        for i, dirname in enumerate(dirnames):
                            d = join(dirpath, dirname)
                            if skip_dupe_dirs:
                                canon_d = normpath(abspath(d))
                                if follow_symlinks:
                                    canon_d = realpath(canon_d)
                                if canon_d in searched_dirs:
                                    dir_indices_to_remove.append(i)
                                    continue
                                else:
                                    searched_dirs.add(canon_d)
                            if dirs == "always" \
                               and _should_include_path(d, includes, excludes):
                                if yield_structure:
                                    _dirnames.append(dirname)
                                else:
                                    yield d
                            if not _should_include_path(d, [], excludes):
                                dir_indices_to_remove.append(i)
                        for i in reversed(dir_indices_to_remove):
                            del dirnames[i]
                        if files:
                            for filename in sorted(filenames):
                                f = join(dirpath, filename)
                                if _should_include_path(f, includes, excludes):
                                    if yield_structure:
                                        _filenames.append(filename)
                                    else:
                                        yield f

                        # Yield one (dirpath, dirnames, filenames) triple per
                        # visited dir, a la os.walk.
                        if yield_structure:
                            yield dirpath, _dirnames, _filenames

                elif not recursive:
                    _filenames = []
                    _dirnames = []
                    for filename in os.listdir(path):
                        try:
                            filepath = os.path.join(path, filename)
                            if _should_include_path(filepath, includes, excludes):
                                filetype = "dir" if os.path.isdir(filepath) else "file"

                                if yield_structure:
                                    if filetype is "dir":
                                        _dirnames.append(filename)
                                    else:
                                        _filenames.append(filename)
                                else:
                                    yield filepath
                        except UnicodeDecodeError, ex:
                            # This may happen with remote filesystems whose
                            # encoding does not match the current system's.
                            log.exception("Unable to search in `%s/%s' due to UTF-8 decoding error",
                                          path.decode('utf-8', 'replace'),
                                          filename.decode('utf-8', 'replace'))

                    if yield_structure:
                        yield path, _dirnames, _filenames

            elif files and _should_include_path(path, includes, excludes):
                if not yield_structure:
                    yield path




#---- internal accessor stuff

class _TextAccessor(object):
    """An API for accessing some text (allowing for a nicer API and caching).

    This is based on codeintel's accessor classes, but drops the lexing info
    and is otherwise simplified.
    """
    def __init__(self, text):
        self.text = text

    def line_and_col_from_pos(self, pos):
        #TODO: Fix this. This is busted for line 0 (at least). Really?
        line = self.line_from_pos(pos)
        col = pos - self.__start_pos_from_line[line]
        return line, col
    
    __start_pos_from_line = None
    def line_from_pos(self, pos):
        r"""
            >>> sa = _TextAccessor(
            ...         #0         1           2         3
            ...         #01234567890 123456789 01234567890 12345
            ...         'import sys\nif True:\nprint "hi"\n# bye')
            >>> sa.line_from_pos(0)
            0
            >>> sa.line_from_pos(9)
            0
            >>> sa.line_from_pos(10)
            0
            >>> sa.line_from_pos(11)
            1
            >>> sa.line_from_pos(22)
            2
            >>> sa.line_from_pos(34)
            3
            >>> sa.line_from_pos(35)
            3
        """
        # Build the line -> start-pos info.
        if self.__start_pos_from_line is None:
            self.__start_pos_from_line = [0]
            for line_str in self.text.splitlines(True):
                self.__start_pos_from_line.append(
                    self.__start_pos_from_line[-1] + len(line_str))

        # Binary search for line number.
        lower, upper = 0, len(self.__start_pos_from_line)
        sentinel = 25
        while sentinel > 0:
            line = ((upper - lower) / 2) + lower
            #print "LINE %d: limits=(%d, %d) start-pos=%d (sentinel %d)"\
            #      % (line, lower, upper, self.__start_pos_from_line[line], sentinel)
            if pos < self.__start_pos_from_line[line]:
                upper = line
            elif line+1 == upper or self.__start_pos_from_line[line+1] > pos:
                return line
            else:
                lower = line
            sentinel -= 1
        else:
            raise FindError("line_from_pos binary search sentinel hit: there "
                            "is likely a logic problem here!")

    def pos_from_line_and_col(self, line, col):
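        r"""Return the character position for the given (0-based) line
        and column.

        An illustrative doctest, reusing the sample text from
        `line_from_pos` above:

            >>> sa = _TextAccessor('import sys\nif True:\nprint "hi"\n# bye')
            >>> sa.pos_from_line_and_col(1, 3)
            14
        """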
        if not self.__start_pos_from_line:
            self.line_from_pos(len(self.text)) # force init
        return self.__start_pos_from_line[line] + col

    def text_range(self, start, end):
        return self.text[start:end]



#---- internal support stuff


# Recipe: relpath (0.2)
def _relpath(path, relto=None):
    """Relativize the given path to another (relto).

    "relto" indicates a directory to which to make "path" relative.
        It defaults to the cwd if not specified.
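
    For example, on a POSIX box one would typically get (illustrative, not
    a doctest; the separator and drive handling are platform-specific):
        _relpath("/a/b/c", "/a")   =>   "b/c"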
    """
    if not os.path.isabs(path):
        path = os.path.abspath(path)
    if relto is None:
        relto = os.getcwd()
    else:
        relto = os.path.abspath(relto)

    if sys.platform.startswith("win"):
        def _equal(a, b): return a.lower() == b.lower()
    else:
        def _equal(a, b): return a == b

    pathDrive, pathRemainder = os.path.splitdrive(path)
    if not pathDrive:
        pathDrive = os.path.splitdrive(os.getcwd())[0]
    relToDrive, relToRemainder = os.path.splitdrive(relto)
    if not _equal(pathDrive, relToDrive):
        # Which is better: raise an exception or return ""?
        return ""
        #raise OSError("Cannot make '%s' relative to '%s'. They are on "\
        #              "different drives." % (path, relto))

    pathParts = _splitall(pathRemainder)[1:] # drop the leading root dir
    relToParts = _splitall(relToRemainder)[1:] # drop the leading root dir
    #print "_relpath: pathPaths=%s" % pathParts
    #print "_relpath: relToPaths=%s" % relToParts
    for pathPart, relToPart in zip(pathParts, relToParts):
        if _equal(pathPart, relToPart):
            # drop the leading common dirs
            del pathParts[0]
            del relToParts[0]
        else:
            # Stop at the first differing component; later coincidental
            # matches must not be stripped.
            break
    #print "_relpath: pathParts=%s" % pathParts
    #print "_relpath: relToParts=%s" % relToParts
    # Relative path: walk up from "relto" dir and walk down "path".
    relParts = [os.curdir] + [os.pardir]*len(relToParts) + pathParts
    #print "_relpath: relParts=%s" % relParts
    relPath = os.path.normpath( os.path.join(*relParts) )
    return relPath


# Recipe: splitall (0.2)
def _splitall(path):
    r"""Split the given path into all constituent parts.

    Often, it's useful to process parts of paths more generically than
    os.path.split(), for example if you want to walk up a directory.
    This recipe splits a path into each piece which corresponds to a
    mount point, directory name, or file.

    (From the Python Cookbook, Files section, Recipe 99.)
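
    A couple of illustrative examples:

        >>> _splitall('a/b/c')
        ['a', 'b', 'c']
        >>> _splitall('/a/b/c/')
        ['/', 'a', 'b', 'c']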
    """
    allparts = []
    while 1:
        parts = os.path.split(path)
        if parts[0] == path:  # sentinel for absolute paths
            allparts.insert(0, parts[0])
            break
        elif parts[1] == path: # sentinel for relative paths
            allparts.insert(0, parts[1])
            break
        else:
            path = parts[0]
            allparts.insert(0, parts[1])
    allparts = [p for p in allparts if p] # drop empty strings 
    return allparts


#TODO: Make a personal recipe of this.
def _get_friendly_id():
    """Create an ID string we can recognise.
    (Think Italian or Japanese or Native American.)

    from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/526619
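
    The result is always 8 characters long, alternating consonant/vowel
    (e.g. something like "bakomade"; the exact string is random):

        >>> len(_get_friendly_id())
        8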
    """
    from random import choice
    v = 'aeiou'
    c = 'bdfghklmnprstvw'
    
    return ''.join([choice(v if i%2 else c) for i in range(8)])

#---- If we have smart case-sensitivity, and the search-string is all
#     lower-case, preserve all-caps and initial-caps matches
def do_smart_conversion(foundText, replText):
    """
    If found is all lower-case, return replText as is.
    If FOUND is all upper-case, return replText.upper().
    If Found is capitalized, return replText with its first letter
    upper-cased (the rest is left as is).
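
    Illustrative examples (derived from the rules above):

        >>> do_smart_conversion("foo", "bar")
        'bar'
        >>> do_smart_conversion("FOO", "bar")
        'BAR'
        >>> do_smart_conversion("Foo", "bar")
        'Bar'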
    """
    if foundText.lower() == foundText:
        return replText # as is
    if foundText.upper() == foundText:
        return replText.upper()
    if foundText[0].isupper() and replText[0].islower():
        # Capitalize the first letter, leave the rest as is
        return replText[0].upper() + replText[1:]
    # There are no other templates that make sense.
    return replText


#---- self-test mainline

def _test():
    r"""Some extra doctests:

    Bug 80881:
    
        >>> regex_info_from_str("/[w-z]$/", universal_newlines=True) \
        ...   == (re.compile(r'[w-z](?=\r\n|(?<!\r)\n|\r(?!\n)|\Z)'), None)
        True
    """
    import doctest
    doctest.testmod()

if __name__ == "__main__":
    _test()
