| #!/usr/bin/env dart |
| // Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file |
| // for details. All rights reserved. Use of this source code is governed by a |
| // BSD-style license that can be found in the LICENSE file. |
| // |
| // ---------------------------------------------------------------------- |
| // This is a very specialized tool which was created in order to support |
| // adding hash values used as location markers in the LaTeX source of the |
| // language specification. It is intended to take its input file as the |
| // first argument, an output file name as the second argument, and a |
| // hash listing file name as the third argument. From docs/language a |
| // typical usage would be as follows: |
| // |
| // dart ../../tools/addlatexhash.dart dartLangSpec.tex out.tex hash.txt |
| // |
| // This will produce a normalized variant out.tex of the language |
| // specification with hash values filled in, and a listing hash.txt of |
| // all the hash values along with the label of their textual context |
| // (section, subsection, subsubsection, paragraph). For more details, |
| // please check the language specification source itself. |
| // |
| // NB: This utility assumes UN*X style line endings, \n, in the LaTeX |
| // source file received as input; it will not work with other styles. |
| |
| import 'dart:convert'; |
| import 'dart:io'; |
| |
| import 'package:convert/convert.dart'; |
| import 'package:crypto/crypto.dart'; |
| |
| // ---------------------------------------------------------------------- |
| // Normalization of the text: removal or normalization of parts that |
| // do not affect the output from latex, such as white space. |
| |
/// Matches a TeX comment together with the single character in front of
/// its '%'; that character must not be a backslash, so an escaped `\%` is
/// not taken for a comment. NB: '.' does not match '\n', hence the match
/// stops before the end-of-line character.
final commentRE = RegExp(r"[^\\]%.*");

/// Matches a string that consists of one or more white-space characters
/// and nothing else.
final whitespaceAllRE = RegExp(r"^\s+$");

/// Matches a run of two or more white-space characters, excluding
/// end-of-line (each character must be `\s` and also be matched by '.').
final whitespaceRE = RegExp(r"(?:(?=\s).){2,}");
| |
/// Removes the part of [line] covered by [match] and inserts [glue] in
/// its place.
///
/// The removed range is `match.start + startOffset` up to
/// `match.end + endOffset`; both indices are bounded to the interval
/// `0..line.length` so that they are always valid indices into [line].
/// If [match] is null then [line] is returned unchanged.
cutMatch(line, match, {startOffset = 0, endOffset = 0, glue = ""}) {
  if (match == null) return line;
  var len = line.length;
  var start = match.start + startOffset;
  var end = match.end + endOffset;
  // Bound both indices on both sides, such that no combination of offsets
  // can make `substring` throw.
  if (start < 0) start = 0;
  if (start > len) start = len;
  if (end < 0) end = 0;
  if (end > len) end = len;
  return line.substring(0, start) + glue + line.substring(end);
}
| |
/// Removes the first match of [re] in [line], if any, by delegating to
/// [cutMatch] with the given [startOffset], [endOffset] and [glue].
/// If [re] does not match then [line] is returned.
cutRegexp(line, re, {startOffset = 0, endOffset = 0, glue = ""}) {
  return cutMatch(line, re.firstMatch(line),
      startOffset: startOffset, endOffset: endOffset, glue: glue);
}
| |
/// Removes the rest of [line] starting from the beginning of the
/// given [match], adjusted by the given [offset], and appends [glue].
///
/// The cut position is bounded to the interval `0..line.length`, in the
/// same way as [cutMatch] bounds its indices. If [match] is null then
/// [line] is returned unchanged.
cutFromMatch(line, match, {offset = 0, glue = ""}) {
  if (match == null) return line;
  var cut = match.start + offset;
  if (cut < 0) cut = 0;
  if (cut > line.length) cut = line.length;
  return line.substring(0, cut) + glue;
}
| |
/// Removes the rest of [line] starting from the first match of [re], by
/// delegating to [cutFromMatch] with the given [offset] and [glue].
/// If [re] does not match then [line] is returned.
cutFromRegexp(line, re, {offset = 0, glue = ""}) {
  return cutFromMatch(line, re.firstMatch(line), offset: offset, glue: glue);
}
| |
/// Returns true iff [line] is non-empty and contains only white-space.
isWsOnly(line) {
  return whitespaceAllRE.hasMatch(line);
}
/// Returns true iff the very first character of [line] is '%'.
isCommentOnly(line) {
  return line.startsWith("%");
}
| |
/// Returns "\n" if [line] ends with an end-of-line character, and the
/// empty string otherwise.
justEol(line) {
  if (line.endsWith("\n")) return "\n";
  return "";
}
| |
/// Empties the comment at the end of [line] while keeping its "%".
///
/// A line that starts with "%" becomes exactly "%\n". For any other line
/// the text after the first unescaped "%" is removed, up to but not
/// including the end-of-line character. A line without a comment is
/// returned unchanged.
///
/// NB: dropping the "%" and the final newline as well looks tempting,
/// since TeX itself discards them, but TeX then adds back a character
/// that depends on its state (S, M, or N), and mirroring that state
/// faithfully is tricky. Leaving the bare "%" in place lets TeX manage
/// its own state exactly as it did for the unstripped file.
stripComment(line) => isCommentOnly(line)
    ? "%\n"
    // [commentRE] starts one character before the "%"; an offset of two
    // skips that character and the "%" itself.
    : cutRegexp(line, commentRE, startOffset: 2);
| |
/// Normalizes the white-space of a single [line].
///
/// A line that is empty after removal of leading white-space is reduced
/// to its end-of-line character (or to the empty string if it has none).
/// Otherwise leading white-space is dropped, and every run of two or
/// more white-space characters other than end-of-line becomes one space.
normalizeWhitespace(line) {
  final withoutIndent = line.trimLeft();
  return withoutIndent.isEmpty
      ? justEol(line)
      : withoutIndent.replaceAll(whitespaceRE, " ");
}
| |
/// Returns a new list with the elements of [lines], where each block of
/// white-space-only lines is reduced to its first line, and all
/// comment-only lines are omitted.
///
/// Comment-only lines that occur inside a block of white-space-only
/// lines are treated as part of that block, so blank lines on both
/// sides of such a comment collapse into a single blank line.
multilineNormalize(lines) {
  var afterBlankLines = false; // Does [line] succeed >0 empty lines?
  var afterCommentLines = false; // Does [line] succeed >0 commentOnly lines?
  var newLines = [];
  for (var line in lines) {
    if (afterBlankLines && afterCommentLines) {
      // Previous line was both blank and a comment: not possible.
      throw "Bug, please report to eernst@";
    } else if (afterBlankLines && !afterCommentLines) {
      // At least one line before [line] is wsOnly.
      if (!isWsOnly(line)) {
        // Blank line block ended.
        afterCommentLines = isCommentOnly(line);
        // Special case: It seems to be safe to remove commentOnly lines
        // after wsOnly lines, so the TeX state must be predictably right;
        // next line will then be afterCommentLines and be dropped, so
        // we drop the entire comment block---which is very useful. We can
        // also consider this comment line to be an empty line, such that
        // subsequent empty lines can be considered to be in a block of
        // empty lines. Note that almost all variants of this breaks.
        if (afterCommentLines) {
          // _Current_ 'line' is a commentOnly here.
          afterBlankLines = true;
          afterCommentLines = false;
          // Omit addition of [line].
        } else {
          // After blanks, but current 'line' is neither blank nor comment.
          afterBlankLines = false;
          newLines.add(line);
        }
      } else {
        // Blank line block continues, omit addition of [line].
      }
    } else if (!afterBlankLines && afterCommentLines) {
      // At least one line before [line] is commentOnly.
      if (!isCommentOnly(line)) {
        // Comment block ended.
        afterBlankLines = isWsOnly(line);
        afterCommentLines = false;
        newLines.add(line);
      } else {
        // Comment block continues, do not add [line].
      }
    } else {
      assert(!afterBlankLines && !afterCommentLines);
      // No wsOnly or commentOnly lines precede [line].
      afterBlankLines = isWsOnly(line);
      afterCommentLines = isCommentOnly(line);
      if (!afterCommentLines) {
        newLines.add(line);
      } else {
        // skip commentOnly line after nonWs/nonComment text.
      }
    }
  }
  return newLines;
}
| |
/// Runs the normalization pipeline for ordinary lines: first the comment
/// contents are stripped from [line], then its white-space is normalized.
normalize(line) {
  final uncommented = stripComment(line);
  return normalizeWhitespace(uncommented);
}
| |
/// Runs the normalization pipeline for lines in a significant-spacing
/// block: only comment contents are stripped from [line], its
/// white-space is left as it is.
sispNormalize(line) {
  return stripComment(line);
}
| |
| // Managing fragments with significant spacing. |
| |
/// Matches `\begin{dartCode}` at the start of a line, allowing leading
/// white-space and white-space between `\begin` and '{'.
final dartCodeBeginRE = RegExp(r"^\s*\\begin\s*\{dartCode\}");

/// Matches `\end{dartCode}` at the start of a line, allowing leading
/// white-space and white-space between `\end` and '{'.
final dartCodeEndRE = RegExp(r"^\s*\\end\s*\{dartCode\}");
| |
/// Returns true iff [line] starts a dartCode block, i.e., iff it is
/// matched by [dartCodeBeginRE].
sispIsDartBegin(line) {
  return dartCodeBeginRE.hasMatch(line);
}
| |
/// Returns true iff [line] ends a dartCode block, i.e., iff it is
/// matched by [dartCodeEndRE].
sispIsDartEnd(line) {
  return dartCodeEndRE.hasMatch(line);
}
| |
| // ---------------------------------------------------------------------- |
| // Analyzing the input to point out "interesting" lines |
| |
/// Returns the list of events for [lines] as determined by the given
/// [analyzer].
///
/// The method `analyzer.analyze` is called once per line, in order. It
/// indicates that a line is "uninteresting" by returning null (no event
/// is recorded), and it characterizes an "interesting" line via the
/// event object that it returns, which is added to the result.
findEvents(lines, analyzer) {
  var events = [];
  for (var line in lines) {
    var event = analyzer.analyze(line);
    if (event != null) events.add(event);
  }
  return events;
}
| |
/// Returns a RegExp that recognizes a command occupying a line of its
/// own, given the RegExp source text [cmdNameRE] that recognizes the
/// command name.
///
/// The line may have leading white-space, white-space between the
/// command name and '{', and an optional '%' plus white-space after the
/// final '}'.
lineCommandRE(cmdNameRE) {
  const beforeName = r"^\s*\\";
  const afterName = r"\s*\{.*\}%?\s*$";
  return RegExp(beforeName + cmdNameRE + afterName);
}
| |
/// Matches the start of an `\LMLabel` line up to and including its '{'.
final hashLabelStartRE = RegExp(r"^\s*\\LMLabel\s*\{");

/// Matches the '}' that ends the argument of a command on a line of its
/// own, followed by an optional '%' and trailing white-space. The '%' is
/// accepted because the RegExps built by `lineCommandRE` accept it, too.
final hashLabelEndRE = RegExp(r"\}%?\s*$");
| |
// Recognizers for commands on a line of their own, all built by
// [lineCommandRE].

// The two commands that carry hash related information.
final RegExp hashMarkRE = lineCommandRE("LMHash");
final RegExp hashLabelRE = lineCommandRE("LMLabel");

// Any sectioning command: \section, \subsection, \subsubsection, or
// \paragraph.
final RegExp sectioningRE = lineCommandRE("((|sub(|sub))section|paragraph)");

// Each sectioning command individually.
final RegExp sectionRE = lineCommandRE("section");
final RegExp subsectionRE = lineCommandRE("subsection");
final RegExp subsubsectionRE = lineCommandRE("subsubsection");
final RegExp paragraphRE = lineCommandRE("paragraph");
| |
/// Returns true iff [line] is an `\LMHash{..}` line, i.e., iff it begins
/// a block of lines that gets a hash value.
isHashMarker(line) {
  return hashMarkRE.hasMatch(line);
}
| |
/// Returns true iff [line] is an `\LMLabel{..}` line, i.e., iff it
/// defines a sectioning label.
isHashLabel(line) {
  return hashLabelRE.hasMatch(line);
}
| |
// The predicates below assume that a sectioning command does not contain
// a newline between the command name and the '{'.

/// Returns true iff [line] is any sectioning command (`\section`,
/// `\subsection`, `\subsubsection`, or `\paragraph`).
isSectioningCommand(line) {
  return sectioningRE.hasMatch(line);
}

/// Returns true iff [line] is a `\section` command.
isSectionCommand(line) {
  return sectionRE.hasMatch(line);
}

/// Returns true iff [line] is a `\subsection` command.
isSubsectionCommand(line) {
  return subsectionRE.hasMatch(line);
}

/// Returns true iff [line] is a `\subsubsection` command.
isSubsubsectionCommand(line) {
  return subsubsectionRE.hasMatch(line);
}

/// Returns true iff [line] is a `\paragraph` command.
isParagraphCommand(line) {
  return paragraphRE.hasMatch(line);
}
| |
/// Returns false iff [line] is a sectioning command, because those are
/// the lines that end a block of lines that gets a hash value; returns
/// true for every other line.
bool isntHashBlockTerminator(line) {
  if (isSectioningCommand(line)) return false;
  return true;
}
| |
/// Returns the label text part from [line], i.e., the text between the
/// end of the [hashLabelStartRE] match and the start of the
/// [hashLabelEndRE] match.
///
/// It is assumed that `isHashLabel(line)` returns true. Throws an
/// [ArgumentError] if one of the two RegExps does not match [line]; the
/// check is explicit rather than an `assert`, such that it is also
/// performed when assertions are disabled.
extractHashLabel(line) {
  final startMatch = hashLabelStartRE.firstMatch(line);
  final endMatch = hashLabelEndRE.firstMatch(line);
  if (startMatch == null || endMatch == null) {
    throw ArgumentError.value(
        line, "line", "Not a well-formed \\LMLabel{..} line");
  }
  return line.substring(startMatch.end, endMatch.start);
}
| |
| // Event classes: Keep track of relevant information about the LaTeX |
| // source code lines, such as where \LMHash and \LMLabel commands are |
| // used, and how they are embedded in the sectioning structure. |
| |
/// Common supertype of all events, which makes it possible to call
/// [setEndLineNumber] and [getStartLineNumber] on any event.
abstract class HashEvent {
  /// Records [n] as the end of the block of lines associated with this
  /// event.
  ///
  /// Only event types concerned with blocks of lines, rather than single
  /// lines, have an end line number; this default implementation is for
  /// the others and ignores [n].
  setEndLineNumber(n) {
    // Nothing to record by default.
  }

  /// Returns the start line number for \LMHash{} events, and null for
  /// all other events (which is what this default implementation does).
  ///
  /// The value serves as a boundary: the preceding \LMHash{} block
  /// should stop before the line of this \LMHash{} command. Note that
  /// hash blocks may stop earlier, because they cannot contain
  /// sectioning commands.
  getStartLineNumber() {
    return null;
  }
}
| |
/// Event for an \LMHash{} command, which marks a block of lines that
/// gets hashed.
class HashMarkerEvent extends HashEvent {
  HashMarkerEvent(this.startLineNumber);

  /// Line number stored at construction; reported by
  /// [getStartLineNumber].
  var startLineNumber;

  /// Highest possible number of the first line after the block that gets
  /// hashed (where the next \LMHash{} occurs).
  ///
  /// This value is not known when the event is created (that line has
  /// not yet been reached), so it is filled in by a separate scan via
  /// [setEndLineNumber]. The block may end earlier, because a block ends
  /// if it would otherwise include a sectioning command.
  var endLineNumber;

  @override
  setEndLineNumber(n) {
    endLineNumber = n;
  }

  @override
  getStartLineNumber() {
    return startLineNumber;
  }
}
| |
/// Event for an \LMLabel{} command; it only carries the label text and
/// inherits the no-op line number handling from [HashEvent].
class HashLabelEvent extends HashEvent {
  HashLabelEvent(this.labelText);

  /// The label text given at construction.
  var labelText;
}
| |
/// Line-by-line analyzer which reports \LMHash{} and \LMLabel{} lines as
/// events, and which keeps track of the most recently seen sectioning
/// command in order to compute the prefix of label texts.
class HashAnalyzer {
  // List of kinds of pending (= most recently seen) sectioning command.
  // Each kind must have its own distinct value, because
  // [sectioningPrefix] selects the prefix by switching on the value.
  // When updating this list, also update sectioningPrefix below.
  static const PENDING_IS_NONE = 0;
  static const PENDING_IS_SECTION = 1;
  static const PENDING_IS_SUBSECTION = 2;
  static const PENDING_IS_SUBSUBSECTION = 3;
  static const PENDING_IS_PARAGRAPH = 4;

  /// Zero-based number of the line that the next call to [analyze] will
  /// receive.
  var lineNumber = 0;

  /// The kind of the pending sectioning command, one of the
  /// `PENDING_IS_..` constants.
  var pendingSectioning = PENDING_IS_NONE;

  HashAnalyzer();

  setPendingToSection() {
    pendingSectioning = PENDING_IS_SECTION;
  }

  setPendingToSubsection() {
    pendingSectioning = PENDING_IS_SUBSECTION;
  }

  setPendingToSubsubsection() {
    pendingSectioning = PENDING_IS_SUBSUBSECTION;
  }

  setPendingToParagraph() {
    pendingSectioning = PENDING_IS_PARAGRAPH;
  }

  clearPending() {
    pendingSectioning = PENDING_IS_NONE;
  }

  /// Returns the label prefix that corresponds to the pending sectioning
  /// command: "sec:", "subsec:", "subsubsec:", or "par:".
  ///
  /// Throws a string if no sectioning command is pending, or if
  /// [pendingSectioning] holds a value that is not handled here.
  sectioningPrefix() {
    switch (pendingSectioning) {
      case PENDING_IS_SECTION:
        return "sec:";
      case PENDING_IS_SUBSECTION:
        return "subsec:";
      case PENDING_IS_SUBSUBSECTION:
        return "subsubsec:";
      case PENDING_IS_PARAGRAPH:
        return "par:";
      case PENDING_IS_NONE:
        throw "\\LMHash{..} should only be used after a sectioning command " +
            "(\\section, \\subsection, \\subsubsection, \\paragraph)";
      default:
        // set of PENDING_IS_.. was extended, but updates here omitted
        throw "Bug, please report to eernst@";
    }
  }

  /// Analyzes [line], which must be the line following the one given to
  /// the previous call (the line count is maintained internally).
  ///
  /// Returns a [HashMarkerEvent] holding the zero-based number of [line]
  /// if it is an \LMHash{} line, and a [HashLabelEvent] holding the
  /// prefixed label text if it is an \LMLabel{} line. Returns null for
  /// every other line; if such a line is a sectioning command then it
  /// becomes the pending sectioning command.
  analyze(line) {
    var currentLineNumber = lineNumber++;
    if (isHashMarker(line)) {
      return HashMarkerEvent(currentLineNumber);
    } else if (isHashLabel(line)) {
      var labelText = sectioningPrefix() + extractHashLabel(line);
      return HashLabelEvent(labelText);
    } else {
      // No events to emit, but we may need to note state changes
      if (isSectionCommand(line)) {
        setPendingToSection();
      } else if (isSubsectionCommand(line)) {
        setPendingToSubsection();
      } else if (isSubsubsectionCommand(line)) {
        setPendingToSubsubsection();
      } else if (isParagraphCommand(line)) {
        setPendingToParagraph();
      } else {
        // No state changes.
      }
      return null;
    }
  }
}
| |
/// Returns the list of hash related events for [lines], with the end
/// line number of each event filled in.
///
/// The events are visited from last to first. Each event receives the
/// current boundary as its end line number; the boundary starts out as
/// `lines.length` and moves to the start line number of each event that
/// has one.
findHashEvents(lines) {
  // First pass: create the events, still without end line numbers.
  final events = findEvents(lines, HashAnalyzer());

  // Second pass, backwards: hand out the end line numbers.
  var boundary = lines.length;
  for (final event in events.reversed) {
    event.setEndLineNumber(boundary);
    boundary = event.getStartLineNumber() ?? boundary;
  }
  return events;
}
| |
| // ---------------------------------------------------------------------- |
| // Removal of non-normative elements of the text (rationale, commentary). |
| |
/// Returns [line] without the command [cmdName] (of the form
/// "\\cmdName\s*{..}") that starts at [startIndex], which must be the
/// index of the backslash of the command.
///
/// It is assumed but not checked that [line] contains "\\cmdName\s*{.."
/// at [startIndex]. The end of the {..} block is found via brace
/// matching (i.e., nested {..} blocks are handled), where a character
/// that follows an unescaped backslash is never counted as a brace; in
/// particular, the '{' in "\\\\{" is a regular brace, because the two
/// backslashes form a command of their own. It may break if '{' is made
/// an active character etc.etc.
///
/// Throws a string if no '{' follows the command name, or if the braces
/// of the block are not balanced within [line].
removeCommand(line, cmdName, startIndex) {
  const backslash = 92; // char code for '\\'.
  const braceBegin = 123; // char code for '{'.
  const braceEnd = 125; // char code for '}'.

  // Skip the backslash and the command name, then search for the '{'.
  var blockStartIndex = startIndex + cmdName.length + 1;
  while (blockStartIndex < line.length &&
      line.codeUnitAt(blockStartIndex) != braceBegin) {
    blockStartIndex++;
  }
  blockStartIndex++;
  if (blockStartIndex > line.length) {
    throw "Bug, please report to eernst@";
  }
  // [blockStartIndex] has index just after '{'.

  var afterEscape = false; // Is true iff [index] is just after an
  // unescaped backslash.
  var braceLevel = 1; // Have seen so many '{'s minus so many '}'s.

  for (var index = blockStartIndex; index < line.length; index++) {
    final codeUnit = line.codeUnitAt(index);
    if (afterEscape) {
      // Escaped character, including an escaped backslash: it neither
      // changes the brace level nor escapes the next character.
      afterEscape = false;
    } else if (codeUnit == backslash) {
      afterEscape = true;
    } else if (codeUnit == braceBegin) {
      braceLevel++;
    } else if (codeUnit == braceEnd) {
      braceLevel--;
    }
    if (braceLevel == 0) {
      return line.substring(0, startIndex) + line.substring(index + 1);
    }
  }
  // Removal failed; we consider this to mean that the input is ill-formed.
  throw "Unmatched braces";
}
| |
/// Matches the start of a `\commentary{` command, allowing white-space
/// before the '{'.
final commentaryRE = RegExp(r"\\commentary\s*\{");

/// Matches the start of a `\rationale{` command, allowing white-space
/// before the '{'.
final rationaleRE = RegExp(r"\\rationale\s*\{");
| |
/// Returns [line] with all {}-balanced '\commentary{..}' commands
/// removed, by repeatedly removing the first one until none is left.
removeCommentary(line) {
  var remaining = line;
  while (true) {
    final hit = commentaryRE.firstMatch(remaining);
    if (hit == null) return remaining;
    remaining = removeCommand(remaining, r"commentary", hit.start);
  }
}
| |
/// Returns [line] with all {}-balanced '\rationale{..}' commands
/// removed, by repeatedly removing the first one until none is left.
removeRationale(line) {
  var remaining = line;
  while (true) {
    final hit = rationaleRE.firstMatch(remaining);
    if (hit == null) return remaining;
    remaining = removeCommand(remaining, r"rationale", hit.start);
  }
}
| |
/// Returns [line] reduced to its normative text: the '\commentary{..}'
/// commands are removed first, then the '\rationale{..}' commands, and
/// finally the white-space of the result is normalized.
simplifyLine(line) =>
    normalizeWhitespace(removeRationale(removeCommentary(line)));
| |
| // ---------------------------------------------------------------------- |
| // Recognition of line blocks, insertion of block hash into \LMHash{}. |
| |
/// Matches from the first '{' to the last '}' on a line (the match is
/// greedy, and '.' does not cross a newline).
final latexArgumentRE = RegExp(r"\{.*\}");
| |
/// Returns [line] without its comment, including the '%', and without
/// trailing white-space.
cleanupLine(line) {
  // [commentRE] starts one character before the '%'; an offset of one
  // keeps that character and cuts from the '%' onwards.
  final uncommented = cutRegexp(line, commentRE, startOffset: 1);
  return uncommented.trimRight();
}
| |
/// Returns the text of a hash block as a single string.
///
/// The lines of [lines] from [startIndex] up to but not including
/// [nextIndex] are visited in order, stopping early at the first line
/// which is a hash block terminator. Each visited line is cleaned up
/// using [cleanupLine], and the results are joined with " " in between.
gatherLines(lines, startIndex, nextIndex) {
  final gathered = [];
  for (final line in lines.getRange(startIndex, nextIndex)) {
    if (!isntHashBlockTerminator(line)) break;
    gathered.add(cleanupLine(line));
  }
  return gathered.join(" ");
}
| |
/// Returns the bytes of the SHA-1 digest of the UTF-8 encoded, simplified
/// text of the line block that starts at [startIndex] in [lines] and
/// stops just before [nextIndex].
///
/// SIDE EFFECT: Writes the simplified text to [listSink] as one line of
/// the form ' % <text>'.
computeHashValue(lines, startIndex, nextIndex, listSink) {
  final simplifiedLine =
      simplifyLine(gatherLines(lines, startIndex, nextIndex));
  listSink.write(" % $simplifiedLine\n");
  return sha1.convert(utf8.encode(simplifiedLine)).bytes;
}
| |
/// Returns the hash value computed by [computeHashValue] for the same
/// arguments, encoded as a hexadecimal string. Has the same side effect
/// on [listSink] as [computeHashValue].
computeHashString(lines, startIndex, nextIndex, listSink) {
  final digestBytes = computeHashValue(lines, startIndex, nextIndex, listSink);
  return hex.encode(digestBytes);
}
| |
/// Fills in the hash values of the \LMHash{} lines in [lines], and emits
/// the hash listing to [listSink].
///
/// The events in [hashEvents] are processed in order; the \LMHash{}
/// lines must be on the line numbers specified in those events:
///
/// * For a [HashLabelEvent], the label text is written to [listSink] on
///   a line of its own.
/// * For a [HashMarkerEvent], the hash of the block after the marker
///   line is computed (which writes the simplified text to [listSink],
///   using the format ' % <text>', where the text is one, long line, for
///   easy grepping etc.), the {..} part of the marker line in [lines] is
///   replaced by the hash value in braces, and the hash value is written
///   to [listSink].
addHashMarks(lines, hashEvents, listSink) {
  for (final event in hashEvents) {
    if (event is HashLabelEvent) {
      listSink.write("${event.labelText}\n");
      continue;
    }
    if (event is! HashMarkerEvent) continue;

    final markerIndex = event.startLineNumber;
    // The hashed block starts on the line after the marker line.
    final hashValue = computeHashString(
        lines, markerIndex + 1, event.endLineNumber, listSink);
    lines[markerIndex] =
        lines[markerIndex].replaceAll(latexArgumentRE, "{" + hashValue + "}");
    listSink.write(" $hashValue\n");
  }
}
| |
/// Transforms LaTeX input to LaTeX output plus hash value list file.
///
/// Expects exactly three elements in [args]: the input file name, the
/// output file name, and the list file name. Prints a usage message and
/// throws a string if the number of arguments differs (a missing [args]
/// counts as zero arguments). Throws a [FileSystemException] if the
/// input file does not exist.
main([args]) async {
  final arguments = args ?? const [];
  if (arguments.length != 3) {
    print("Usage: addlatexhash.dart <input-file> <output-file> <list-file>");
    throw "Received ${arguments.length} arguments, expected three";
  }

  // Get LaTeX source. The existence check is explicit rather than an
  // `assert`, such that it is also performed when assertions are disabled.
  var inputFile = File(arguments[0]);
  if (!inputFile.existsSync()) {
    throw FileSystemException("Input file does not exist", inputFile.path);
  }
  var lines = inputFile.readAsLinesSync();

  // Will hold LaTeX source with normalized spacing etc., plus hash values.
  var outputFile = File(arguments[1]);

  // Will hold hierarchical list of hash values.
  var listFile = File(arguments[2]);
  var listSink = listFile.openWrite();

  try {
    // Perform single-line normalization.
    var inDartCode = false;
    var normalizedLines = [];

    for (var line in lines) {
      if (sispIsDartBegin(line)) {
        inDartCode = true;
      } else if (sispIsDartEnd(line)) {
        inDartCode = false;
      }
      // [readAsLinesSync] strips the line terminators, so "\n" is added
      // back to each line before it is normalized.
      if (inDartCode) {
        normalizedLines.add(sispNormalize(line + "\n"));
      } else {
        normalizedLines.add(normalize(line + "\n"));
      }
    }

    // Perform multi-line normalization.
    normalizedLines = multilineNormalize(normalizedLines);

    // Insert hash values.
    var hashEvents = findHashEvents(normalizedLines);
    addHashMarks(normalizedLines, hashEvents, listSink);

    // Produce/finalize output.
    outputFile.writeAsStringSync(normalizedLines.join());
  } finally {
    // Close the list file even if the processing failed, and wait for the
    // close to complete such that write errors are not silently dropped.
    await listSink.close();
  }
}