"""distutils.filelist

Provides the FileList class, used for poking about the filesystem
and building lists of files.
"""

import os
import re
import fnmatch
import functools

from distutils.util import convert_path
from distutils.errors import DistutilsTemplateError, DistutilsInternalError
from distutils import log


class FileList:
    """A list of files built by exploring the filesystem and filtered by
    applying various patterns to what we find there.

    Instance attributes:
      files
        list of filenames currently being built/filtered/manipulated
      allfiles
        complete list of files under consideration (ie. without any
        filtering applied); None until it is supplied via
        'set_allfiles()' or computed by 'findall()'
    """

    def __init__(self, warn=None, debug_print=None):
        # ignore arguments to FileList, but keep them for backwards
        # compatibility
        self.allfiles = None
        self.files = []

    def set_allfiles(self, allfiles):
        """Use 'allfiles' as the complete list of candidate files."""
        self.allfiles = allfiles

    def findall(self, dir=os.curdir):
        """Populate 'self.allfiles' with every file found under 'dir'
        (delegates to the module-level 'findall()' function).
        """
        self.allfiles = findall(dir)

    def debug_print(self, msg):
        """Print 'msg' to stdout if the global DEBUG (taken from the
        DISTUTILS_DEBUG environment variable) flag is true.
        """
        from distutils.debug import DEBUG
        if DEBUG:
            print(msg)

    # Collection methods

    def append(self, item):
        """Append a single filename to 'self.files'."""
        self.files.append(item)

    def extend(self, items):
        """Append every filename in 'items' to 'self.files'."""
        self.files.extend(items)

    def sort(self):
        """Sort 'self.files' by (directory, basename) pairs.

        'self.files' is rebound to a new, sorted list.
        """
        # Not a strict lexical sort!  Each path is split into
        # (head, tail) first, so the comparison is done per component.
        sortable_files = sorted(map(os.path.split, self.files))
        self.files = [os.path.join(*parts) for parts in sortable_files]

    # Other miscellaneous utility methods

    def remove_duplicates(self):
        """Remove adjacent duplicate entries from 'self.files' in place."""
        # Assumes list has been sorted!  Only neighbouring entries are
        # compared; walking backwards keeps the indices valid while
        # deleting.
        for i in range(len(self.files) - 1, 0, -1):
            if self.files[i] == self.files[i - 1]:
                del self.files[i]

    # "File template" methods

    def _parse_template_line(self, line):
        """Split one template line into its components.

        Return a tuple (action, patterns, dir, dir_pattern).  'action' is
        always set; which of the other three are non-None depends on the
        action: 'patterns' alone, 'dir' and 'patterns', or 'dir_pattern'
        alone.

        Raise DistutilsTemplateError if the line is empty, names an
        unknown action, or has the wrong number of words for its action.
        """
        words = line.split()
        if not words:
            # A blank line has no action word; report it like any other
            # malformed line instead of failing with a bare IndexError.
            raise DistutilsTemplateError(
                "empty template line: expected an action")
        action = words[0]

        patterns = dir = dir_pattern = None

        if action in ('include', 'exclude',
                      'global-include', 'global-exclude'):
            if len(words) < 2:
                raise DistutilsTemplateError(
                      "'%s' expects <pattern1> <pattern2> ..." % action)
            patterns = [convert_path(w) for w in words[1:]]
        elif action in ('recursive-include', 'recursive-exclude'):
            if len(words) < 3:
                raise DistutilsTemplateError(
                      "'%s' expects <dir> <pattern1> <pattern2> ..." % action)
            dir = convert_path(words[1])
            patterns = [convert_path(w) for w in words[2:]]
        elif action in ('graft', 'prune'):
            if len(words) != 2:
                raise DistutilsTemplateError(
                      "'%s' expects a single <dir_pattern>" % action)
            dir_pattern = convert_path(words[1])
        else:
            raise DistutilsTemplateError("unknown action '%s'" % action)

        return (action, patterns, dir, dir_pattern)

    def process_template_line(self, line):
        """Parse one template line and apply its action to 'self.files'.

        A warning is logged for every pattern that matches nothing.
        Raise DistutilsTemplateError if the line cannot be parsed.
        """
        # Parse the line: split it up, make sure the right number of words
        # is there, and return the relevant words.  'action' is always
        # defined: it's the first word of the line.  Which of the other
        # three are defined depends on the action; it'll be either
        # patterns, (dir and patterns), or (dir_pattern).
        (action, patterns, dir, dir_pattern) = self._parse_template_line(line)

        # OK, now we know that the action is valid and we have the
        # right number of words on the line for that action -- so we
        # can proceed with minimal error-checking.
        if action == 'include':
            self.debug_print("include " + ' '.join(patterns))
            for pattern in patterns:
                if not self.include_pattern(pattern, anchor=1):
                    log.warn("warning: no files found matching '%s'",
                             pattern)

        elif action == 'exclude':
            self.debug_print("exclude " + ' '.join(patterns))
            for pattern in patterns:
                if not self.exclude_pattern(pattern, anchor=1):
                    log.warn(("warning: no previously-included files "
                              "found matching '%s'"), pattern)

        elif action == 'global-include':
            self.debug_print("global-include " + ' '.join(patterns))
            for pattern in patterns:
                if not self.include_pattern(pattern, anchor=0):
                    log.warn(("warning: no files found matching '%s' "
                              "anywhere in distribution"), pattern)

        elif action == 'global-exclude':
            self.debug_print("global-exclude " + ' '.join(patterns))
            for pattern in patterns:
                if not self.exclude_pattern(pattern, anchor=0):
                    log.warn(("warning: no previously-included files matching "
                              "'%s' found anywhere in distribution"),
                             pattern)

        elif action == 'recursive-include':
            self.debug_print("recursive-include %s %s" %
                             (dir, ' '.join(patterns)))
            for pattern in patterns:
                if not self.include_pattern(pattern, prefix=dir):
                    msg = (
                        "warning: no files found matching '%s' "
                        "under directory '%s'"
                    )
                    log.warn(msg, pattern, dir)

        elif action == 'recursive-exclude':
            self.debug_print("recursive-exclude %s %s" %
                             (dir, ' '.join(patterns)))
            for pattern in patterns:
                if not self.exclude_pattern(pattern, prefix=dir):
                    log.warn(("warning: no previously-included files matching "
                              "'%s' found under directory '%s'"),
                             pattern, dir)

        elif action == 'graft':
            self.debug_print("graft " + dir_pattern)
            if not self.include_pattern(None, prefix=dir_pattern):
                log.warn("warning: no directories found matching '%s'",
                         dir_pattern)

        elif action == 'prune':
            self.debug_print("prune " + dir_pattern)
            if not self.exclude_pattern(None, prefix=dir_pattern):
                log.warn(("no previously-included directories found "
                          "matching '%s'"), dir_pattern)
        else:
            raise DistutilsInternalError(
                  "this cannot happen: invalid action '%s'" % action)

    # Filtering/selection methods

    def include_pattern(self, pattern, anchor=1, prefix=None, is_regex=0):
        """Select strings (presumably filenames) from 'self.allfiles' that
        match 'pattern', a Unix-style wildcard (glob) pattern.  Patterns
        are not quite the same as implemented by the 'fnmatch' module: '*'
        and '?' do not match the path separator ('os.sep'); see
        'glob_to_re()'.

        If 'anchor' is true (the default), then the pattern match is more
        stringent: "*.py" will match "foo.py" but not "foo/bar.py".  If
        'anchor' is false, both of these will match.

        If 'prefix' is supplied, then only filenames starting with 'prefix'
        (itself a pattern) and ending with 'pattern', with anything in between
        them, will match.  'anchor' is ignored in this case.

        If 'is_regex' is true, 'anchor' and 'prefix' are ignored, and
        'pattern' is assumed to be either a string containing a regex or a
        regex object -- no translation is done, the regex is just compiled
        and used as-is.

        If 'self.allfiles' has not been set yet, it is computed first by
        calling 'self.findall()'.

        Selected strings will be added to self.files.

        Return True if files are found, False otherwise.
        """
        files_found = False
        pattern_re = translate_pattern(pattern, anchor, prefix, is_regex)
        self.debug_print("include_pattern: applying regex r'%s'" %
                         pattern_re.pattern)

        # delayed loading of allfiles list
        if self.allfiles is None:
            self.findall()

        for name in self.allfiles:
            if pattern_re.search(name):
                self.debug_print(" adding " + name)
                self.files.append(name)
                files_found = True
        return files_found

    def exclude_pattern(
            self, pattern, anchor=1, prefix=None, is_regex=0):
        """Remove strings (presumably filenames) from 'files' that match
        'pattern'.  Other parameters are the same as for
        'include_pattern()', above.
        The list 'self.files' is modified in place.
        Return True if files are found, False otherwise.
        """
        files_found = False
        pattern_re = translate_pattern(pattern, anchor, prefix, is_regex)
        self.debug_print("exclude_pattern: applying regex r'%s'" %
                         pattern_re.pattern)
        # Walk backwards so that deleting an entry does not shift the
        # indices still to be visited.
        for i in range(len(self.files)-1, -1, -1):
            if pattern_re.search(self.files[i]):
                self.debug_print(" removing " + self.files[i])
                del self.files[i]
                files_found = True
        return files_found


# Utility functions

def _find_all_simple(path):
    """
    Find all files under 'path'.

    Return a lazy iterable of the joined paths for which
    'os.path.isfile()' is true.  Symbolic links to directories are
    followed; '_UniqueDirs' keeps already-seen directories from being
    walked again.
    """
    all_unique = _UniqueDirs.filter(os.walk(path, followlinks=True))
    results = (
        os.path.join(base, file)
        for base, dirs, files in all_unique
        for file in files
    )
    return filter(os.path.isfile, results)


class _UniqueDirs(set):
    """
    Exclude previously-seen dirs from walk results,
    avoiding infinite recursion.
    Ref https://bugs.python.org/issue44497.

    The set holds one (st_dev, st_ino) pair per directory seen so far.
    """
    def __call__(self, walk_item):
        """
        Given an item from an os.walk result, determine
        if the item represents a unique dir for this instance
        and if not, prevent further traversal.

        Return True for a directory not seen before, False otherwise.
        """
        base, dirs, files = walk_item
        stat = os.stat(base)
        candidate = stat.st_dev, stat.st_ino
        found = candidate in self
        if found:
            # Emptying 'dirs' in place tells os.walk not to descend.
            del dirs[:]
        self.add(candidate)
        return not found

    @classmethod
    def filter(cls, items):
        """Filter os.walk 'items' lazily through a fresh instance."""
        return filter(cls(), items)


def findall(dir=os.curdir):
    """
    Find all files under 'dir' and return the list of full filenames.
    Unless dir is '.', return full filenames with dir prepended.
    When dir is '.', the names are made relative to it instead.
    """
    files = _find_all_simple(dir)
    if dir == os.curdir:
        make_rel = functools.partial(os.path.relpath, start=dir)
        files = map(make_rel, files)
    return list(files)


def glob_to_re(pattern):
    """Translate a shell-like glob pattern to a regular expression; return
    a string containing the regex.  Differs from 'fnmatch.translate()' in
    that '*' does not match "special characters" (which are
    platform-specific).
    """
    pattern_re = fnmatch.translate(pattern)

    # '?' and '*' in the glob pattern become '.' and '.*' in the RE, which
    # IMHO is wrong -- '?' and '*' aren't supposed to match slash in Unix,
    # and by extension they shouldn't match such "special characters" under
    # any OS.  So change all non-escaped dots in the RE to match any
    # character except the special characters (currently: just os.sep).
    sep = os.sep
    if os.sep == '\\':
        # we're using a regex to manipulate a regex, so we need
        # to escape the backslash twice
        sep = r'\\\\'
    escaped = r'\1[^%s]' % sep
    pattern_re = re.sub(r'((?<!\\)(\\\\)*)\.', escaped, pattern_re)
    return pattern_re


def translate_pattern(pattern, anchor=1, prefix=None, is_regex=0):
    """Translate a shell-like wildcard pattern to a compiled regular
    expression.  Return the compiled regex.  If 'is_regex' true,
    then 'pattern' is directly compiled to a regex (if it's a string)
    or just returned as-is (assumes it's a regex object).

    If 'prefix' is given, the regex matches names that start with
    'prefix', followed by the path separator, anything, and finally
    'pattern'; 'anchor' is ignored.  Otherwise a true 'anchor' pins the
    pattern to the start of the name.
    """
    if is_regex:
        if isinstance(pattern, str):
            return re.compile(pattern)
        else:
            return pattern

    # ditch start and end characters
    # (translating a lone '_' shows which wrapper text glob_to_re puts
    # before and after a translated pattern)
    start, _, end = glob_to_re('_').partition('_')

    if pattern:
        pattern_re = glob_to_re(pattern)
        assert pattern_re.startswith(start) and pattern_re.endswith(end)
    else:
        pattern_re = ''

    if prefix is not None:
        prefix_re = glob_to_re(prefix)
        assert prefix_re.startswith(start) and prefix_re.endswith(end)
        prefix_re = prefix_re[len(start): len(prefix_re) - len(end)]
        sep = os.sep
        if os.sep == '\\':
            sep = r'\\'
        pattern_re = pattern_re[len(start): len(pattern_re) - len(end)]
        pattern_re = r'%s\A%s%s.*%s%s' % (
            start, prefix_re, sep, pattern_re, end)
    else:                               # no prefix -- respect anchor flag
        if anchor:
            pattern_re = r'%s\A%s' % (start, pattern_re[len(start):])

    return re.compile(pattern_re)