File indexing completed on 2025-01-18 10:18:47
0001 """
0002 Check lesson files and their contents.
0003 """
0004
0005
0006 import os
0007 import glob
0008 import re
0009 import sys
0010 from argparse import ArgumentParser
0011
0012
0013
0014 from util import *
0015 from reporter import Reporter
0016
__version__ = '0.3'

# Directories, relative to the lesson's source directory, that
# read_all_markdown() globs for '*.md' files.  The empty string stands for
# the source directory itself.
SOURCE_DIRS = ['', '_episodes', '_extras']

# Directories, relative to the source directory, that check_source_rmd()
# globs for '*.Rmd' episode files.
SOURCE_RMD_DIRS = ['_episodes_rmd']

# Files that check_fileset() reports as missing when they are absent, keyed
# by path relative to the source directory.
# NOTE(review): the boolean values are not read anywhere in this file;
# presumably they record whether the file must carry YAML metadata -- confirm.
REQUIRED_FILES = {
    'CODE_OF_CONDUCT.md': True,
    'CONTRIBUTING.md': False,
    'LICENSE.md': True,
    'README.md': False,
    os.path.join('_extras', 'discuss.md'): True,
    os.path.join('_extras', 'guide.md'): True,
    'index.md': True,
    'reference.md': True,
    'setup.md': True,
}
0043
0044
# Episode filenames look like 'NN-some-title.md'; group 1 is the two-digit
# episode number.  The dot before 'md' is escaped so that only a literal
# '.md' extension is accepted.
P_EPISODE_FILENAME = re.compile(r'(\d\d)-[-\w]+\.md$')

# One or more whitespace characters at the end of the text.
P_TRAILING_WHITESPACE = re.compile(r'\s+$')

# An HTML <img> tag; group 1 is the value of its src attribute.
P_FIGURE_REFS = re.compile(r'<img[^>]+src="([^"]+)"[^>]*>')

# A reference-style link '[text][label]'; groups are (text, label).
P_INTERNAL_LINK_REF = re.compile(r'\[([^\]]+)\]\[([^\]]+)\]')

# A link definition '[label]: target' at the start of a line; groups are
# (label, target).
P_INTERNAL_LINK_DEF = re.compile(r'^\[([^\]]+)\]:\s*(.+)')

# A line consisting solely of '{% include <file> %}'; group 1 is the file.
P_INTERNAL_INCLUDE_LINK = re.compile(r'^{% include ([^ ]*) %}$')

# A line that holds nothing but a single link or image (optionally quoted,
# in a heading, or followed by a '{: ...}' attribute list).  Such lines are
# exempt from the line-length check in CheckBase.check_line_lengths().
P_LINK_IMAGE_LINE = re.compile(r'''
    [> #]*            # any number of '>', '#', and spaces
    \W{,3}            # up to 3 non-word characters
    !?                # ! or nothing
    \[[^]]+\]         # [any text]
    [([]              # ( or [
    [^])]+            # 1+ characters that are neither ] nor )
    [])]              # ] or )
    (?:{:[^}]+})?     # {:any text} or nothing
    \W{,3}            # up to 3 non-word characters
    [ ]*              # any number of spaces
    \\?$              # \ or nothing + end of line''', re.VERBOSE)
0075
0076
# Blockquote classes accepted by CheckBase.check_blockquote_classes().
KNOWN_BLOCKQUOTES = {
    'callout', 'caution', 'challenge', 'checklist',
    'discussion', 'keypoints', 'objectives', 'prereq',
    'quotation', 'solution', 'testimonial', 'warning',
}

# Code block classes accepted by CheckBase.check_codeblock_classes(), which
# additionally accepts any class starting with 'language-'.
KNOWN_CODEBLOCKS = {'error', 'output', 'source', 'warning'}

# (name, type) pairs that CheckEpisode requires in the metadata of an episode
# that declares no 'layout'.
TEACHING_METADATA_FIELDS = {
    ('title', str),
    ('teaching', int),
    ('exercises', int),
    ('questions', list),
    ('objectives', list),
    ('keypoints', list),
}

# (name, type) pairs that CheckEpisode requires in the metadata of an episode
# whose 'layout' is 'break'.
BREAK_METADATA_FIELDS = {
    ('layout', str),
    ('title', str),
    ('break', int),
}

# Lines longer than this are reported when line-length checking is enabled.
MAX_LINE_LEN = 100

# Contents of the lesson's _config.yml; replaced by main() once it is loaded.
CONFIG = {}
0124
def main():
    """Main driver: load the configuration, run every check, report results.

    Reads '_config.yml' from the source directory into the module-level
    CONFIG, runs the configuration, Rmd, file-set and per-file checks, then
    prints a summary.  Exits with status 1 when problems were reported and
    permissive mode is off.
    """

    global CONFIG

    args = parse_args()
    args.reporter = Reporter()
    reporter = args.reporter

    config_path = os.path.join(args.source_dir, '_config.yml')
    CONFIG = load_yaml(config_path)
    CONFIG["config_file"] = config_path

    # Lessons declared as pre-alpha are never allowed to fail the run.
    if CONFIG.get('life_cycle', None) == "pre-alpha":
        args.permissive = True

    check_config(reporter)
    check_source_rmd(reporter, args.source_dir, args.parser)

    args.references = read_references(reporter, args.reference_path)

    docs = read_all_markdown(args.source_dir, args.parser)
    check_fileset(args.source_dir, reporter, list(docs.keys()))
    check_unwanted_files(args.source_dir, reporter)
    for filename, info in list(docs.items()):
        create_checker(args, filename, info).check()

    reporter.report()
    if not reporter.messages:
        print("No problems found.")
    elif args.permissive:
        print("Problems detected but ignored (permissive mode).")
    else:
        print("Problems detected.")
        sys.exit(1)
0164
0165
def parse_args():
    """Parse and validate the command-line arguments.

    Returns the parsed namespace.  Uses require() to insist that a Markdown
    parser path was given and that no unrecognised arguments remain.
    """

    cli = ArgumentParser(description="""Check episode files in a lesson.""")

    cli.add_argument(
        '-l', '--linelen',
        dest='line_lengths', action="store_true", default=False,
        help='Check line lengths')
    cli.add_argument(
        '-p', '--parser',
        dest='parser', default=None,
        help='path to Markdown parser')
    cli.add_argument(
        '-r', '--references',
        dest='reference_path', default=None,
        help='path to Markdown file of external references')
    cli.add_argument(
        '-s', '--source',
        dest='source_dir', default=os.curdir,
        help='source directory')
    cli.add_argument(
        '-w', '--whitespace',
        dest='trailing_whitespace', action="store_true", default=False,
        help='Check for trailing whitespace')
    cli.add_argument(
        '--permissive',
        dest='permissive', action="store_true", default=False,
        help='Do not raise an error even if issues are detected')

    # parse_known_args() collects unrecognised arguments instead of exiting,
    # so they can be reported through require() below.
    args, extras = cli.parse_known_args()

    parser_given = args.parser is not None
    require(parser_given, 'Path to Markdown parser not provided', True)

    unexpected = 'Unexpected trailing command-line arguments "{0}"'.format(extras)
    require(not extras, unexpected)

    return args
0206
def check_config(reporter):
    """Check the loaded configuration (module-level CONFIG).

    Verifies the 'kind', 'carpentry', 'title' and 'email' fields through
    reporter.check_field(), then checks that each expected entry appears
    verbatim in the configuration's 'defaults' list.
    """

    config_file = CONFIG["config_file"]

    # (field name, extra positional arguments for check_field)
    field_checks = [
        ('kind', ('lesson',)),
        ('carpentry', (('swc', 'dc', 'lc', 'cp', 'incubator'),)),
        ('title', ()),
        ('email', ()),
    ]
    for key, expected in field_checks:
        reporter.check_field(config_file, 'configuration', CONFIG, key, *expected)

    expected_defaults = [
        {'values': {'root': '.', 'layout': 'page'}},
        {'values': {'root': '..', 'layout': 'episode'},
         'scope': {'type': 'episodes', 'path': ''}},
        {'values': {'root': '..', 'layout': 'page'},
         'scope': {'type': 'extras', 'path': ''}},
    ]
    for entry in expected_defaults:
        values = entry["values"]
        message = 'incorrect settings for: root "{0}" layout "{1}"'.format(
            values["root"], values["layout"])
        is_declared = entry in CONFIG.get('defaults', [])
        reporter.check(is_declared, 'configuration', message)
0229
def check_source_rmd(reporter, source_dir, parser):
    """Check that Rmd episode files include `source: Rmd`.

    Every '*.Rmd' file in the SOURCE_RMD_DIRS sub-directories of source_dir
    is read with read_markdown(); when it has metadata, the 'source' field
    is checked against 'Rmd' through reporter.check_field().
    """

    for rmd_dir in SOURCE_RMD_DIRS:
        pattern = os.path.join(os.path.join(source_dir, rmd_dir), '*.Rmd')
        for path in glob.glob(pattern):
            data = read_markdown(parser, path)
            # Skip files for which nothing was read, the same way
            # read_all_markdown() does, instead of failing on the lookup.
            if not data:
                continue
            metadata = data['metadata']
            if metadata:
                reporter.check_field(path, 'episode_rmd',
                                     metadata, 'source', 'Rmd')
0243
def read_references(reporter, ref_path):
    """Read the shared file of reference links.

    Returns a dictionary {symbolic_name : URL} of the link definitions found
    in ref_path, or an empty dictionary when the configuration declares a
    'remote_theme'.  Blank lines, single-line HTML comments and
    '{% include ... %}' lines are skipped; any other line must be a link
    definition.  Duplicate names and duplicate URLs are reported.

    Raises Warning when no remote theme is configured and ref_path is empty.
    """

    if 'remote_theme' in CONFIG:
        return {}

    if not ref_path:
        raise Warning("No filename has been provided.")

    references = {}
    seen_urls = set()

    with open(ref_path, 'r', encoding='utf-8') as reader:
        for (num, line) in enumerate(reader, 1):
            stripped = line.strip()

            is_blank = len(stripped) == 0
            if (is_blank
                    or (stripped.startswith("<!--") and stripped.endswith("-->"))
                    or P_INTERNAL_INCLUDE_LINK.search(line)):
                continue

            definition = P_INTERNAL_LINK_DEF.search(line)
            require(definition,
                    '{}: {} not a valid reference: {}'.format(
                        ref_path, num, line.rstrip()))

            name, url = definition.group(1), definition.group(2)
            require(name, 'Empty reference at {0}:{1}'.format(ref_path, num))

            is_new_name = name not in references
            is_new_url = url not in seen_urls

            reporter.check(is_new_name, ref_path,
                           'Duplicate reference name {0} at line {1}',
                           name, num)
            reporter.check(is_new_url, ref_path,
                           'Duplicate definition of URL {0} at line {1}',
                           url, num)

            references[name] = url
            seen_urls.add(url)

    return references
0301
0302
def read_all_markdown(source_dir, parser):
    """Read every '*.md' file in the SOURCE_DIRS sub-directories of source_dir.

    Returns
    {path : {'metadata':yaml, 'metadata_len':N, 'text':text, 'lines':[(i, line, len)], 'doc':doc}}
    with one entry per file for which read_markdown() returned data.
    """

    found = {}
    for subdir in SOURCE_DIRS:
        pattern = os.path.join(os.path.join(source_dir, subdir), '*.md')
        for path in glob.glob(pattern):
            data = read_markdown(parser, path)
            if not data:
                continue
            found[path] = data
    return found
0317
0318
def check_fileset(source_dir, reporter, filenames_present):
    """Are all required files present? Are episode files numbered sanely?

    Reports every REQUIRED_FILES entry that is absent from filenames_present,
    every episode whose name does not match P_EPISODE_FILENAME, duplicated
    episode numbers, and gaps in the episode numbering.
    """

    # Required files that were not found.
    required = [os.path.join(source_dir, p) for p in REQUIRED_FILES]
    for absent in set(required) - set(filenames_present):
        reporter.add(None, 'Missing required file {0}', absent)

    # Collect the two-digit prefixes of all episode files.
    numbers = []
    for path in filenames_present:
        if '_episodes' not in path:
            continue

        found = P_EPISODE_FILENAME.search(os.path.basename(path))
        if found and found.group(1):
            numbers.append(found.group(1))
        else:
            reporter.add(
                None, 'Episode {0} has badly-formatted filename', path)

    # No two episodes may share a number.
    distinct = set(numbers)
    reporter.check(len(numbers) == len(distinct),
                   None,
                   'Duplicate episode numbers {0} vs {1}',
                   sorted(numbers), sorted(distinct))

    # Sorted numbers must increase in steps of exactly one.
    ordered = sorted([int(s) for s in numbers])
    consecutive = all(
        (later - earlier) == 1
        for (earlier, later) in zip(ordered, ordered[1:]))
    reporter.check(consecutive,
                   None,
                   'Missing or non-consecutive episode numbers {0}',
                   ordered)
0358
0359
def create_checker(args, filename, info):
    """Create the checker for a file from the first matching CHECKERS entry.

    info is expanded into keyword arguments of the checker's constructor.
    Returns NotImplemented when no pattern in CHECKERS matches filename.
    """

    candidates = (cls for (pat, cls) in CHECKERS if pat.search(filename))
    checker_cls = next(candidates, None)
    if checker_cls is None:
        return NotImplemented
    return checker_cls(args, filename, **info)
0367
class CheckBase:
    """Base class for checking Markdown files.

    Subclasses may set ``self.layout`` to have the 'layout' metadata field
    checked, and may override or extend individual ``check_*`` methods.
    """

    def __init__(self, args, filename, metadata, metadata_len, text, lines, doc):
        """Cache arguments for checking.

        Args:
            args: parsed command-line namespace; this class reads its
                ``reporter``, ``line_lengths``, ``trailing_whitespace`` and
                ``references`` attributes.
            filename: path of the file being checked, used in reports.
            metadata: the file's metadata, or None when it has none.
            metadata_len: offset added to node locations by get_loc(), or
                None (presumably the length of the metadata header in
                lines -- confirm against read_markdown).
            text: stored on the instance; not read by this class.
            lines: sequence of (line number, line text, line length) tuples.
            doc: root node (a dictionary) of the parsed document tree.
        """

        self.args = args
        self.reporter = self.args.reporter
        self.filename = filename
        self.metadata = metadata
        self.metadata_len = metadata_len
        self.text = text
        self.lines = lines
        self.doc = doc

        # Expected value of the 'layout' metadata field; None disables the check.
        self.layout = None

    def check(self):
        """Run tests."""

        self.check_metadata()
        self.check_line_lengths()
        self.check_trailing_whitespace()
        self.check_blockquote_classes()
        self.check_codeblock_classes()
        self.check_defined_link_references()

    def check_metadata(self):
        """Check the YAML metadata."""

        self.reporter.check(self.metadata is not None,
                            self.filename,
                            'Missing metadata entirely')

        if self.metadata and (self.layout is not None):
            self.reporter.check_field(
                self.filename, 'metadata', self.metadata, 'layout', self.layout)

    def check_line_lengths(self):
        """Report lines longer than MAX_LINE_LEN (only when enabled)."""

        if self.args.line_lengths:
            # Lines holding nothing but a link or an image are exempt.
            over_limit = [
                i for (i, l, n) in self.lines
                if n > MAX_LINE_LEN and not P_LINK_IMAGE_LINE.match(l)]

            self.reporter.check(not over_limit,
                                self.filename,
                                'Line(s) too long: {0}',
                                ', '.join([str(i) for i in over_limit]))

    def check_trailing_whitespace(self):
        """Check for whitespace at the ends of lines (only when enabled)."""

        if self.args.trailing_whitespace:
            # search(), not match(): the pattern is anchored at the end of the
            # line only, and match() would require the whole line to be
            # whitespace before reporting it.
            trailing = [
                i for (i, l, _) in self.lines if P_TRAILING_WHITESPACE.search(l)]
            self.reporter.check(not trailing,
                                self.filename,
                                'Line(s) end with whitespace: {0}',
                                ', '.join([str(i) for i in trailing]))

    def check_blockquote_classes(self):
        """Check that all blockquotes have known classes."""

        for node in self.find_all(self.doc, {'type': 'blockquote'}):
            cls = self.get_val(node, 'attr', 'class')
            self.reporter.check(cls in KNOWN_BLOCKQUOTES,
                                (self.filename, self.get_loc(node)),
                                'Unknown or missing blockquote type {0}',
                                cls)

    def check_codeblock_classes(self):
        """Check that all code blocks have known classes.

        A class is accepted when it is in KNOWN_CODEBLOCKS or starts with
        'language-'.
        """

        for node in self.find_all(self.doc, {'type': 'codeblock'}):
            cls = self.get_val(node, 'attr', 'class')
            self.reporter.check(cls is not None and (cls in KNOWN_CODEBLOCKS or
                                                     cls.startswith('language-')),
                                (self.filename, self.get_loc(node)),
                                'Unknown or missing code block type {0}',
                                cls)

    def check_defined_link_references(self):
        """Check that defined links resolve in the file.

        Internally-defined links match the pattern [text][label]; every
        label must be a key of ``self.args.references``.
        """

        unresolved = set()
        for node in self.find_all(self.doc, {'type': 'text'}):
            for (text, link) in P_INTERNAL_LINK_REF.findall(node['value']):
                if link not in self.args.references:
                    unresolved.add('"{0}"=>"{1}"'.format(text, link))
        self.reporter.check(not unresolved,
                            self.filename,
                            'Internally-defined links may be missing definitions: {0}',
                            ', '.join(sorted(unresolved)))

    def find_all(self, node, pattern, accum=None):
        """Find all matches for a pattern.

        Walks node and, recursively, its 'children', returning the list of
        nodes that match pattern (see match()).  accum is the list being
        accumulated during recursion.
        """

        assert isinstance(pattern, dict), 'Patterns must be dictionaries'
        if accum is None:
            accum = []
        if self.match(node, pattern):
            accum.append(node)
        for child in node.get('children', []):
            self.find_all(child, pattern, accum)
        return accum

    def match(self, node, pattern):
        """Does this node match the given pattern?

        Every key of pattern must be present in node.  String values must be
        equal, dictionary values are matched recursively, and values of any
        other type only require the key to be present.
        """

        for key in pattern:
            if key not in node:
                return False
            val = pattern[key]
            if isinstance(val, str):
                if node[key] != val:
                    return False
            elif isinstance(val, dict):
                if not self.match(node[key], val):
                    return False
        return True

    @staticmethod
    def get_val(node, *chain):
        """Get value one or more levels down, or None if a level is missing."""

        curr = node
        for selector in chain:
            curr = curr.get(selector, None)
            if curr is None:
                break
        return curr

    def get_loc(self, node):
        """Convenience method to get node's line number.

        Returns the node's 'options' -> 'location' value, shifted by
        ``metadata_len`` when that is set, or None when the node records no
        location.
        """

        location = self.get_val(node, 'options', 'location')
        if location is None:
            # Nothing to offset; adding metadata_len to None would raise.
            return None
        if self.metadata_len is not None:
            location += self.metadata_len
        return location
0519
0520
class CheckNonJekyll(CheckBase):
    """Check a file that isn't translated by Jekyll."""

    def check_metadata(self):
        """Report a problem when the file carries any metadata."""

        has_no_metadata = self.metadata is None
        self.reporter.check(has_no_metadata, self.filename, 'Unexpected metadata')
0528
0529
class CheckIndex(CheckBase):
    """Check the main index page."""

    def __init__(self, args, filename, metadata, metadata_len, text, lines, doc):
        """Cache arguments and require the 'lesson' layout."""

        super().__init__(args, filename, metadata, metadata_len, text, lines, doc)
        self.layout = 'lesson'

    def check_metadata(self):
        """Run the base metadata checks, then require 'root' to be '.'."""

        super().check_metadata()
        # The base check has already reported missing metadata; treat it as
        # empty here so the root check reports a problem instead of raising
        # AttributeError on None.
        metadata = self.metadata if self.metadata is not None else {}
        self.reporter.check(metadata.get('root', '') == '.',
                            self.filename,
                            'Root not set to "."')
0542
0543
class CheckEpisode(CheckBase):
    """Check an episode page."""

    def check(self):
        """Run the base tests plus the links-file inclusion test."""

        super().check()
        self.check_reference_inclusion()

    def check_metadata(self):
        """Check the metadata fields required for the episode's layout.

        Episodes without a 'layout' must provide TEACHING_METADATA_FIELDS;
        episodes with layout 'break' must provide BREAK_METADATA_FIELDS; any
        other layout is reported as unknown.
        """

        super().check_metadata()
        if self.metadata:
            if 'layout' in self.metadata:
                if self.metadata['layout'] == 'break':
                    self.check_metadata_fields(BREAK_METADATA_FIELDS)
                else:
                    self.reporter.add(self.filename,
                                      'Unknown episode layout "{0}"',
                                      self.metadata['layout'])
            else:
                self.check_metadata_fields(TEACHING_METADATA_FIELDS)

    def check_metadata_fields(self, expected):
        """Check metadata fields.

        expected is an iterable of (name, type) pairs; a problem is reported
        for every name that is missing or whose value has another type.
        """

        for (name, type_) in expected:
            if name not in self.metadata:
                self.reporter.add(self.filename,
                                  'Missing metadata field {0}',
                                  name)
            elif not isinstance(self.metadata[name], type_):
                self.reporter.add(self.filename,
                                  '"{0}" has wrong type in metadata ({1} instead of {2})',
                                  name, type(self.metadata[name]), type_)

    def check_reference_inclusion(self):
        """Check that links file has been included.

        The last non-empty line of the episode must mention the base name of
        the references file.  Skipped when a 'remote_theme' is configured or
        no references file was given.
        """

        if 'remote_theme' in CONFIG:
            return

        if not self.args.reference_path:
            return

        # Last non-empty line, or '' when there is none -- including the
        # case of a file with no lines at all, which would otherwise leave
        # the variable unbound.
        last_line = next(
            (text for (_, text, _) in reversed(self.lines) if text), '')

        require(last_line,
                'No non-empty lines in {0}'.format(self.filename))

        include_filename = os.path.split(self.args.reference_path)[-1]
        if include_filename not in last_line:
            self.reporter.add(self.filename,
                              'episode does not include "{0}"',
                              include_filename)
0599
0600
class CheckReference(CheckBase):
    """Check the reference page."""

    def __init__(self, args, filename, metadata, metadata_len, text, lines, doc):
        """Cache arguments and require the 'reference' layout."""

        super().__init__(
            args, filename,
            metadata=metadata, metadata_len=metadata_len,
            text=text, lines=lines, doc=doc)
        self.layout = 'reference'
0607
0608
class CheckGeneric(CheckBase):
    """Check a generic page."""

    def __init__(self, args, filename, metadata, metadata_len, text, lines, doc):
        """Cache arguments; a generic page adds no checks of its own."""

        super().__init__(
            args, filename,
            metadata=metadata, metadata_len=metadata_len,
            text=text, lines=lines, doc=doc)
0614
0615
# (filename pattern, checker class) pairs.  create_checker() uses the first
# entry whose pattern is found in the filename, so order matters and the
# catch-all '.*\.md' entry must stay last.
CHECKERS = [
    (re.compile(r'CONTRIBUTING\.md'), CheckNonJekyll),
    (re.compile(r'README\.md'), CheckNonJekyll),
    (re.compile(r'index\.md'), CheckIndex),
    (re.compile(r'reference\.md'), CheckReference),
    # '[^/]*' cannot cross a '/', so only files directly inside '_episodes'
    # match.  The fragment is a raw string: '\.' is not a valid escape
    # sequence in an ordinary string literal.
    (re.compile(os.path.join('.', '_episodes', r'[^/]*\.md')), CheckEpisode),
    (re.compile(r'.*\.md'), CheckGeneric),
]
0625
0626
# Run the checks only when this file is executed as a script.
if __name__ == '__main__':
    main()