tutorial-setting-up-environment/bin/lesson_check.py

0001 """
0002 Check lesson files and their contents.
0003 """
0004
0005
0006 import os
0007 import glob
0008 import re
0009 import sys
0010 from argparse import ArgumentParser
0011
0012 # This uses the `__all__` list in `util.py` to determine what objects to import
0013 # see https://docs.python.org/3/tutorial/modules.html#importing-from-a-package
0014 from util import *
0015 from reporter import Reporter
0016
__version__ = '0.3'

# Sub-directories of the lesson that are searched for Markdown ('*.md') files.
# The empty string stands for the lesson's top-level directory.
SOURCE_DIRS = ['', '_episodes', '_extras']

# Sub-directories of the lesson that are searched for R Markdown ('*.Rmd') files.
SOURCE_RMD_DIRS = ['_episodes_rmd']

# Files that every lesson must contain, as {relative path: YAML_required}.
# FIXME: We do not yet validate whether any files have the required
#   YAML headers, but should in the future.
# Each path is joined onto the source directory before it is compared with the
# files actually present. Episodes and extra files in '_extras' are handled
# specially. This list must include all the Markdown files listed in the
# 'bin/initialize' script.
REQUIRED_FILES = {
    # Top-level project files.
    'CODE_OF_CONDUCT.md': True,
    'CONTRIBUTING.md': False,
    'LICENSE.md': True,
    'README.md': False,
    # Pages kept under '_extras'.
    os.path.join('_extras', 'discuss.md'): True,
    os.path.join('_extras', 'guide.md'): True,
    # Top-level lesson pages.
    'index.md': True,
    'reference.md': True,
    'setup.md': True,
}
0043
# Episode filename pattern: two digits (captured), a hyphen, a slug made of
# word characters and hyphens, and a literal '.md' suffix. The dot is escaped
# so that it only matches a real '.', not an arbitrary character.
P_EPISODE_FILENAME = re.compile(r'(\d\d)-[-\w]+\.md$')

# Pattern to match whitespace at the end of a line. Use it with `.search()`:
# `.match()` anchors at the start of the line and would only accept lines that
# consist of whitespace alone.
P_TRAILING_WHITESPACE = re.compile(r'\s+$')

# Pattern to match figure references in HTML; group 1 is the `src` value.
P_FIGURE_REFS = re.compile(r'<img[^>]+src="([^"]+)"[^>]*>')

# Pattern to match internally-defined Markdown links of the form
# [text][label]; group 1 is the text, group 2 the label.
P_INTERNAL_LINK_REF = re.compile(r'\[([^\]]+)\]\[([^\]]+)\]')

# Pattern to match reference link definitions of the form `[label]: target`
# (used to resolve internally-defined references).
P_INTERNAL_LINK_DEF = re.compile(r'^\[([^\]]+)\]:\s*(.+)')

# Pattern to match {% include ... %} statements that fill a whole line.
P_INTERNAL_INCLUDE_LINK = re.compile(r'^{% include ([^ ]*) %}$')

# Pattern to match image-only and link-only lines
P_LINK_IMAGE_LINE = re.compile(r'''
    [> #]*        # any number of '>', '#', and spaces
    \W{,3}        # up to 3 non-word characters
    !?            # ! or nothing
    \[[^]]+\]     # [any text]
    [([]          # ( or [
    [^])]+        # 1+ characters that are neither ] nor )
    [])]          # ] or )
    (?:{:[^}]+})? # {:any text} or nothing
    \W{,3}        # up to 3 non-word characters
    [ ]*          # any number of spaces
    \\?$          # \ or nothing + end of line''', re.VERBOSE)
0075
# Blockquote classes that a lesson is allowed to use.
KNOWN_BLOCKQUOTES = {
    'callout', 'caution', 'challenge', 'checklist',
    'discussion', 'keypoints', 'objectives', 'prereq',
    'quotation', 'solution', 'testimonial', 'warning',
}

# Code block classes that a lesson is allowed to use.
# Any class starting with 'language-' is accepted in addition to these.
KNOWN_CODEBLOCKS = {'error', 'output', 'source', 'warning'}

# (field name, expected type) pairs required in teaching episode metadata.
TEACHING_METADATA_FIELDS = {
    ('title', str),
    ('teaching', int),
    ('exercises', int),
    ('questions', list),
    ('objectives', list),
    ('keypoints', list),
}

# (field name, expected type) pairs required in break episode metadata.
BREAK_METADATA_FIELDS = {
    ('layout', str),
    ('title', str),
    ('break', int),
}

# Maximum allowed line length.
# Please keep this in sync with .editorconfig!
MAX_LINE_LEN = 100

# Contents of _config.yml (filled in by main()).
CONFIG = {}
0124
def main():
    """Run all lesson checks and report the outcome.

    Loads '_config.yml' from the source directory into the module-level
    CONFIG, runs the configuration, Rmd, file-set and per-file checks, then
    prints the reporter's messages. Exits with status 1 when problems were
    recorded, unless permissive mode is active.
    """

    global CONFIG

    args = parse_args()
    args.reporter = Reporter()

    config_file = os.path.join(args.source_dir, '_config.yml')
    CONFIG = load_yaml(config_file)
    CONFIG["config_file"] = config_file

    # pre-alpha lessons should report without error
    if CONFIG.get('life_cycle', None) == "pre-alpha":
        args.permissive = True

    check_config(args.reporter)
    check_source_rmd(args.reporter, args.source_dir, args.parser)

    args.references = read_references(args.reporter, args.reference_path)

    docs = read_all_markdown(args.source_dir, args.parser)
    check_fileset(args.source_dir, args.reporter, list(docs.keys()))
    check_unwanted_files(args.source_dir, args.reporter)

    # Iterate over a snapshot of the keys, one checker per Markdown file.
    for filename in list(docs.keys()):
        create_checker(args, filename, docs[filename]).check()

    args.reporter.report()
    if not args.reporter.messages:
        print("No problems found.")
    elif args.permissive:
        print("Problems detected but ignored (permissive mode).")
    else:
        print("Problems detected.")
        sys.exit(1)
0164
0165
def parse_args():
    """Build the command-line parser and return the parsed arguments.

    The Markdown parser path ('-p') is mandatory; its absence is reported
    through `require` with a third argument of True. Unrecognised trailing
    arguments are reported through `require` as well.
    NOTE(review): `require` comes from `util`; the third argument presumably
    makes the failure fatal -- confirm against `util.py`.
    """

    # (flags, keyword settings) for each option, in the order they are added.
    option_specs = [
        (('-l', '--linelen'),
         {'default': False, 'action': "store_true", 'dest': 'line_lengths',
          'help': 'Check line lengths'}),
        (('-p', '--parser'),
         {'default': None, 'dest': 'parser',
          'help': 'path to Markdown parser'}),
        (('-r', '--references'),
         {'default': None, 'dest': 'reference_path',
          'help': 'path to Markdown file of external references'}),
        (('-s', '--source'),
         {'default': os.curdir, 'dest': 'source_dir',
          'help': 'source directory'}),
        (('-w', '--whitespace'),
         {'default': False, 'action': "store_true", 'dest': 'trailing_whitespace',
          'help': 'Check for trailing whitespace'}),
        (('--permissive',),
         {'default': False, 'action': "store_true", 'dest': 'permissive',
          'help': 'Do not raise an error even if issues are detected'}),
    ]

    parser = ArgumentParser(description="""Check episode files in a lesson.""")
    for flags, settings in option_specs:
        parser.add_argument(*flags, **settings)

    args, extras = parser.parse_known_args()

    parser_given = args.parser is not None
    require(parser_given, 'Path to Markdown parser not provided', True)

    extras_message = 'Unexpected trailing command-line arguments "{0}"'.format(extras)
    require(not extras, extras_message)

    return args
0206
def check_config(reporter):
    """Check the contents of the configuration held in CONFIG.

    Verifies the 'kind', 'carpentry', 'title' and 'email' fields through
    `reporter.check_field`, then checks that each expected entry is present
    in the 'defaults' list of the configuration.
    """

    config_file = CONFIG["config_file"]
    section = 'configuration'

    reporter.check_field(config_file, section, CONFIG, 'kind', 'lesson')
    reporter.check_field(config_file, section, CONFIG, 'carpentry',
                         ('swc', 'dc', 'lc', 'cp', 'incubator'))
    reporter.check_field(config_file, section, CONFIG, 'title')
    reporter.check_field(config_file, section, CONFIG, 'email')

    # Entries that must appear verbatim in the 'defaults' list.
    expected_defaults = [
        {'values': {'root': '.', 'layout': 'page'}},
        {'values': {'root': '..', 'layout': 'episode'},
         'scope': {'type': 'episodes', 'path': ''}},
        {'values': {'root': '..', 'layout': 'page'},
         'scope': {'type': 'extras', 'path': ''}},
    ]
    configured = CONFIG.get('defaults', [])

    for entry in expected_defaults:
        values = entry["values"]
        error_message = 'incorrect settings for: root "{0}" layout "{1}"'.format(
            values["root"], values["layout"])
        reporter.check(entry in configured, section, error_message)
0229
def check_source_rmd(reporter, source_dir, parser):
    """Check that Rmd episode files include `source: Rmd`.

    Every '*.Rmd' file found in the SOURCE_RMD_DIRS sub-directories of
    `source_dir` is read with `read_markdown`; when it has metadata, the
    'source' field is checked against 'Rmd' via `reporter.check_field`.
    Files without metadata are skipped silently.
    """

    for rmd_dir in SOURCE_RMD_DIRS:
        pattern = os.path.join(os.path.join(source_dir, rmd_dir), '*.Rmd')
        for path in glob.glob(pattern):
            metadata = read_markdown(parser, path)['metadata']
            if metadata:
                reporter.check_field(path, 'episode_rmd',
                                     metadata, 'source', 'Rmd')
0243
def read_references(reporter, ref_path):
    """Read shared file of reference links, returning dictionary of valid references
    {symbolic_name : URL}

    Returns an empty dictionary when CONFIG contains 'remote_theme'.
    Raises `Warning` when `ref_path` is empty or None. Blank lines,
    single-line HTML comments and Liquid `{% include ... %}` lines are
    ignored. Duplicate names and duplicate URLs are reported through
    `reporter.check`; for duplicates the last definition wins.
    """

    if 'remote_theme' in CONFIG:
        return {}

    if not ref_path:
        raise Warning("No filename has been provided.")

    result = {}
    urls_seen = set()

    with open(ref_path, 'r', encoding='utf-8') as reader:
        for (num, line) in enumerate(reader, 1):
            stripped = line.strip()

            # Skip empty lines
            if not stripped:
                continue

            # Skip HTML comments
            if stripped.startswith("<!--") and stripped.endswith("-->"):
                continue

            # Skip Liquid's {% include ... %} lines
            if P_INTERNAL_INCLUDE_LINK.search(line):
                continue

            m = P_INTERNAL_LINK_DEF.search(line)

            message = '{}: {} not a valid reference: {}'
            require(m, message.format(ref_path, num, line.rstrip()))
            if m is None:
                # NOTE(review): `require` comes from `util`; it is called here
                # without the fatal flag, so it may return. Skip the malformed
                # line instead of failing on `m.group()` below.
                continue

            name = m.group(1)
            url = m.group(2)

            message = 'Empty reference at {0}:{1}'
            require(name, message.format(ref_path, num))

            unique_name = name not in result
            unique_url = url not in urls_seen

            reporter.check(unique_name,
                           ref_path,
                           'Duplicate reference name {0} at line {1}',
                           name, num)

            reporter.check(unique_url,
                           ref_path,
                           'Duplicate definition of URL {0} at line {1}',
                           url, num)

            result[name] = url
            urls_seen.add(url)

    return result
0301
0302
def read_all_markdown(source_dir, parser):
    """Read every Markdown source file of the lesson.

    Looks for '*.md' in each SOURCE_DIRS sub-directory of `source_dir` and
    returns
    {path : {'metadata':yaml, 'metadata_len':N, 'text':text, 'lines':[(i, line, len)], 'doc':doc}}
    Files for which `read_markdown` returns a falsy value are left out.
    """

    result = {}
    for subdir in SOURCE_DIRS:
        pattern = os.path.join(os.path.join(source_dir, subdir), '*.md')
        for filename in glob.glob(pattern):
            data = read_markdown(parser, filename)
            if not data:
                continue
            result[filename] = data
    return result
0317
0318
def check_fileset(source_dir, reporter, filenames_present):
    """Check the set of lesson files.

    Reports required files that are missing, episode files whose names do
    not follow the episode pattern, duplicate episode numbers, and episode
    numbers that are not consecutive.
    """

    # Required files: anything expected but not present is reported.
    expected_paths = [os.path.join(source_dir, p) for p in REQUIRED_FILES]
    for absent in set(expected_paths) - set(filenames_present):
        reporter.add(None, 'Missing required file {0}', absent)

    # Collect the two-digit prefix of every episode file.
    numbers = []
    for filename in filenames_present:
        if '_episodes' in filename:
            # Only the file name itself is matched against the pattern.
            found = P_EPISODE_FILENAME.search(os.path.basename(filename))
            if found and found.group(1):
                numbers.append(found.group(1))
            else:
                reporter.add(
                    None, 'Episode {0} has badly-formatted filename', filename)

    # Each episode number may be used only once.
    distinct = set(numbers)
    reporter.check(len(numbers) == len(distinct),
                   None,
                   'Duplicate episode numbers {0} vs {1}',
                   sorted(numbers), sorted(distinct))

    # Adjacent episode numbers must differ by exactly one.
    ordered = sorted([int(s) for s in numbers])
    consecutive = all(later - earlier == 1
                      for (earlier, later) in zip(ordered, ordered[1:]))
    reporter.check(consecutive,
                   None,
                   'Missing or non-consecutive episode numbers {0}',
                   ordered)
0358
0359
def create_checker(args, filename, info):
    """Instantiate the checker class registered for `filename`.

    CHECKERS is scanned in order and the first pattern that matches wins;
    `info` is expanded into keyword arguments of the checker's constructor.
    Returns `NotImplemented` when no pattern matches.
    """

    candidates = (cls for (pat, cls) in CHECKERS if pat.search(filename))
    checker_class = next(candidates, None)
    if checker_class is None:
        return NotImplemented
    return checker_class(args, filename, **info)
0367
class CheckBase:
    """Base class for checking Markdown files.

    Subclasses may set `self.layout` to require a particular 'layout' value
    in the metadata, and may override individual `check_*` methods.
    """

    def __init__(self, args, filename, metadata, metadata_len, text, lines, doc):
        """Cache arguments for checking.

        `args` must provide `reporter`, `line_lengths`, `trailing_whitespace`
        and `references`. `lines` is a sequence of (line number, text, length)
        triples and `doc` is a tree of dictionaries whose nested nodes are
        listed under 'children'.
        """

        self.args = args
        self.reporter = self.args.reporter  # for convenience
        self.filename = filename
        self.metadata = metadata
        self.metadata_len = metadata_len
        self.text = text
        self.lines = lines
        self.doc = doc

        # Expected 'layout' metadata value; None means "do not check".
        self.layout = None

    def check(self):
        """Run tests."""

        self.check_metadata()
        self.check_line_lengths()
        self.check_trailing_whitespace()
        self.check_blockquote_classes()
        self.check_codeblock_classes()
        self.check_defined_link_references()

    def check_metadata(self):
        """Check the YAML metadata.

        Reports missing metadata and, when the subclass set `self.layout`,
        checks the 'layout' field against it.
        """

        self.reporter.check(self.metadata is not None,
                            self.filename,
                            'Missing metadata entirely')

        if self.metadata and (self.layout is not None):
            self.reporter.check_field(
                self.filename, 'metadata', self.metadata, 'layout', self.layout)

    def check_line_lengths(self):
        """Check the raw text of the lesson body for over-long lines.

        Only runs when line-length checking was requested.
        """

        if self.args.line_lengths:
            # Report lines that are longer than the suggested
            # line length limit only if they're not
            # link-only or image-only lines.
            over_limit = [
                num for (num, text, length) in self.lines
                if length > MAX_LINE_LEN and not P_LINK_IMAGE_LINE.match(text)]

            self.reporter.check(not over_limit,
                                self.filename,
                                'Line(s) too long: {0}',
                                ', '.join([str(i) for i in over_limit]))

    def check_trailing_whitespace(self):
        """Check for whitespace at the ends of lines.

        Only runs when trailing-whitespace checking was requested.
        """

        if self.args.trailing_whitespace:
            # `.search()` is required here: `.match()` anchors at the start of
            # the line and would only flag lines made of whitespace alone.
            trailing = [
                num for (num, text, length) in self.lines
                if P_TRAILING_WHITESPACE.search(text)]
            self.reporter.check(not trailing,
                                self.filename,
                                'Line(s) end with whitespace: {0}',
                                ', '.join([str(i) for i in trailing]))

    def check_blockquote_classes(self):
        """Check that all blockquotes have known classes."""

        for node in self.find_all(self.doc, {'type': 'blockquote'}):
            cls = self.get_val(node, 'attr', 'class')
            self.reporter.check(cls in KNOWN_BLOCKQUOTES,
                                (self.filename, self.get_loc(node)),
                                'Unknown or missing blockquote type {0}',
                                cls)

    def check_codeblock_classes(self):
        """Check that all code blocks have known classes.

        A class is accepted when it is listed in KNOWN_CODEBLOCKS or starts
        with 'language-'.
        """

        for node in self.find_all(self.doc, {'type': 'codeblock'}):
            cls = self.get_val(node, 'attr', 'class')
            known = cls is not None and (cls in KNOWN_CODEBLOCKS or
                                         cls.startswith('language-'))
            self.reporter.check(known,
                                (self.filename, self.get_loc(node)),
                                'Unknown or missing code block type {0}',
                                cls)

    def check_defined_link_references(self):
        """Check that defined links resolve in the file.

        Internally-defined links match the pattern [text][label].
        Labels that are not in `self.args.references` are reported.
        """

        result = set()
        for node in self.find_all(self.doc, {'type': 'text'}):
            for (text, link) in P_INTERNAL_LINK_REF.findall(node['value']):
                if link not in self.args.references:
                    result.add('"{0}"=>"{1}"'.format(text, link))
        self.reporter.check(not result,
                            self.filename,
                            'Internally-defined links may be missing definitions: {0}',
                            ', '.join(sorted(result)))

    def find_all(self, node, pattern, accum=None):
        """Find all matches for a pattern.

        Walks `node` and its 'children' recursively and returns the list of
        nodes that match `pattern`, parents before their children.
        """

        assert isinstance(pattern, dict), 'Patterns must be dictionaries'
        if accum is None:
            accum = []
        if self.match(node, pattern):
            accum.append(node)
        for child in node.get('children', []):
            self.find_all(child, pattern, accum)
        return accum

    def match(self, node, pattern):
        """Does this node match the given pattern?

        Every key of `pattern` must be present in `node`. String values must
        be equal, dictionary values are matched recursively, and values of
        any other type only require the key to be present.
        """

        for key in pattern:
            if key not in node:
                return False
            val = pattern[key]
            if isinstance(val, str):
                if node[key] != val:
                    return False
            elif isinstance(val, dict):
                if not self.match(node[key], val):
                    return False
        return True

    @staticmethod
    def get_val(node, *chain):
        """Get value one or more levels down.

        Follows the keys in `chain` through nested dictionaries and returns
        None as soon as one of them is missing.
        """

        curr = node
        for selector in chain:
            curr = curr.get(selector, None)
            if curr is None:
                break
        return curr

    def get_loc(self, node):
        """Convenience method to get node's line number.

        The node's own location is offset by the length of the metadata
        block when that length is known. Returns None when the node carries
        no location, instead of failing on `None + int`.
        """

        result = self.get_val(node, 'options', 'location')
        if result is None:
            return None
        if self.metadata_len is not None:
            result += self.metadata_len
        return result
0519
0520
class CheckNonJekyll(CheckBase):
    """Checker for files that Jekyll does not translate."""

    def check_metadata(self):
        """Report the file if it carries any metadata."""

        has_no_metadata = self.metadata is None
        self.reporter.check(has_no_metadata, self.filename, 'Unexpected metadata')
0528
0529
class CheckIndex(CheckBase):
    """Check the main index page."""

    def __init__(self, args, filename, metadata, metadata_len, text, lines, doc):
        """Cache arguments and require the 'lesson' layout."""

        super().__init__(args, filename, metadata, metadata_len, text, lines, doc)
        self.layout = 'lesson'

    def check_metadata(self):
        """Run the base metadata checks, then require `root` to be '.'."""

        super().check_metadata()

        # The base check has already reported missing metadata; without this
        # guard `None.get` would abort the whole run with an AttributeError.
        if self.metadata is None:
            return

        self.reporter.check(self.metadata.get('root', '') == '.',
                            self.filename,
                            'Root not set to "."')
0542
0543
class CheckEpisode(CheckBase):
    """Check an episode page."""

    def check(self):
        """Run extra tests."""

        super().check()
        self.check_reference_inclusion()

    def check_metadata(self):
        """Check episode metadata.

        Episodes without a 'layout' field are treated as teaching episodes;
        a 'layout' of 'break' marks a break episode; any other layout is
        reported as unknown.
        """

        super().check_metadata()
        if self.metadata:
            if 'layout' in self.metadata:
                if self.metadata['layout'] == 'break':
                    self.check_metadata_fields(BREAK_METADATA_FIELDS)
                else:
                    self.reporter.add(self.filename,
                                      'Unknown episode layout "{0}"',
                                      self.metadata['layout'])
            else:
                self.check_metadata_fields(TEACHING_METADATA_FIELDS)

    def check_metadata_fields(self, expected):
        """Check metadata fields.

        `expected` is an iterable of (name, type) pairs; missing fields and
        fields whose value is not an instance of the type are reported.
        """
        for (name, type_) in expected:
            if name not in self.metadata:
                self.reporter.add(self.filename,
                                  'Missing metadata field {0}',
                                  name)
            elif not isinstance(self.metadata[name], type_):
                self.reporter.add(self.filename,
                                  '"{0}" has wrong type in metadata ({1} instead of {2})',
                                  name, type(self.metadata[name]), type_)

    def check_reference_inclusion(self):
        """Check that links file has been included.

        The name of the references file must appear in the last non-empty
        line of the episode. Nothing is checked when CONFIG contains
        'remote_theme' or when no references file was given.
        """

        if 'remote_theme' in CONFIG:
            return

        if not self.args.reference_path:
            return

        # Find the last non-empty line. The name is bound up front so that it
        # exists even when `self.lines` is empty.
        last_line = ''
        for (_, text, _) in reversed(self.lines):
            if text:
                last_line = text
                break

        require(last_line,
                'No non-empty lines in {0}'.format(self.filename))

        include_filename = os.path.split(self.args.reference_path)[-1]
        if include_filename not in last_line:
            self.reporter.add(self.filename,
                              'episode does not include "{0}"',
                              include_filename)
0599
0600
class CheckReference(CheckBase):
    """Checker for the reference page."""

    def __init__(self, args, filename, metadata, metadata_len, text, lines, doc):
        """Cache arguments and require the 'reference' layout."""

        super().__init__(args, filename,
                         metadata=metadata,
                         metadata_len=metadata_len,
                         text=text,
                         lines=lines,
                         doc=doc)
        self.layout = 'reference'
0607
0608
class CheckGeneric(CheckBase):
    """Checker for any page without a more specific checker."""

    def __init__(self, args, filename, metadata, metadata_len, text, lines, doc):
        """Cache arguments; no layout is required for generic pages."""

        super().__init__(args, filename,
                         metadata=metadata,
                         metadata_len=metadata_len,
                         text=text,
                         lines=lines,
                         doc=doc)
0614
0615
# (filename pattern, checker class) pairs. Order matters: `create_checker`
# uses the first pattern that matches, so the catch-all entry must stay last.
CHECKERS = [
    (re.compile(r'CONTRIBUTING\.md'), CheckNonJekyll),
    (re.compile(r'README\.md'), CheckNonJekyll),
    (re.compile(r'index\.md'), CheckIndex),
    (re.compile(r'reference\.md'), CheckReference),
    # '.' below is what's passed on the command line via '-s' flag.
    # The last component is a raw string so that '\.' reaches the regex
    # engine without triggering Python's invalid-escape-sequence warning.
    # NOTE(review): the leading '.' is compiled as a regex wildcard, so this
    # also matches paths under other source directories -- confirm whether
    # that is intended.
    (re.compile(os.path.join('.', '_episodes', r'[^/]*\.md')), CheckEpisode),
    (re.compile(r'.*\.md'), CheckGeneric)
]
0625
0626
# Run the checks only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()