#!/bin/python3
r"""Count how often each word occurs in a file and print the frequent ones.

Usage: <script> FILENAME THRESHOLD

Each non-empty line of FILENAME is treated as exactly one word. Words are
compared case-insensitively ('Dog' and 'dog' are the same word), but
punctuation is NOT stripped, so 'the...' is a different word from 'the'.
This could be remedied with something like

    word = line.lower().replace('.', '').replace(',', '')

but is deliberately left out for simplicity.

Every word whose frequency is greater than or equal to THRESHOLD is printed,
most frequent first; words with equal frequency keep the order in which they
first appear in the file.

The output was compared against the following command:

    cat test4-part1-input.txt | tr ' ' '\n' | sort \
        | tr '[:upper:]' '[:lower:]' | uniq -ic | sort -nr

The two agree almost perfectly. The shell pipeline sometimes reports one word
several times (e.g. '15 its' and later '1 its') where this script reports a
single '16 its'. Presumably this is because the pipeline sorts before it
lowercases, so differently-cased spellings are not always adjacent when
`uniq` sees them.
"""

import re
import sys
from collections import Counter
from typing import Iterable, List, Optional


def parse_threshold(text: str) -> Optional[int]:
    """Return *text* as a non-negative int, or None if it is not all digits.

    The whole string must consist of digits; checking only the first
    character would let values such as '1a' through and crash in int().
    """
    if re.fullmatch(r"\d+", text) is None:
        return None
    return int(text)


def count_words(lines: Iterable[str]) -> Counter:
    """Tally the lower-cased content of every non-empty line.

    Args:
        lines: The lines to count; a trailing newline on a line is ignored.

    Returns:
        A Counter mapping each lower-cased word to its number of occurrences,
        in order of first appearance.
    """
    counts = Counter()
    for line in lines:
        word = line.rstrip("\n").lower()
        if word:  # skip empty lines
            counts[word] += 1
    return counts


def format_report(counts: Counter, threshold: int) -> List[str]:
    """Return one formatted row per word with frequency >= *threshold*.

    Rows are ordered by descending frequency; ties keep first-seen order
    (Counter.most_common() is a stable sort).
    """
    return [
        f"{freq:>7} {word}"
        for word, freq in counts.most_common()
        if freq >= threshold
    ]


def main(argv: Optional[List[str]] = None) -> int:
    """Run the word counter and return the process exit status.

    Args:
        argv: Command-line arguments without the program name; defaults to
            sys.argv[1:]. Only the first two (filename, threshold) are used.

    Returns:
        0 on success, 1 if the arguments are missing or invalid.
    """
    args = sys.argv[1:] if argv is None else argv
    if len(args) < 2:
        print(f"usage: {sys.argv[0]} FILENAME THRESHOLD", file=sys.stderr)
        return 1
    filename, raw_threshold = args[:2]

    # Validate before touching the file. argv values are always str.
    threshold = parse_threshold(raw_threshold)
    if threshold is None:
        print("The threshold must be a number.")
        return 1

    # The context manager guarantees the file is closed, even on error.
    with open(filename) as fh:
        counts = count_words(fh)

    for row in format_report(counts, threshold):
        print(row)
    return 0


if __name__ == "__main__":
    sys.exit(main())