#!/bin/python3

"""
Below is the program with my corrections. We assume in general that each line contains only one word, and 
further do not consider whether that word has any punctuation at the end. For example, 'the...' will be 
considered a different word from 'the' and so on. This could easily be remedied by removing all instances 
of punctuation like '.' and ',' by doing something like:

    word = line.lower().replace('.', '').replace(',', '').replace(...)

However, I do not do this for simplicity. I've compared the output to the following command:

    cat test4-part1-input.txt | tr ' ' '\n' | sort | tr '[:upper:]' '[:lower:]' | uniq -ic | sort -nr

They seem to agree almost perfectly. The command above, however, seems to sometimes count a word multiple 
times, so there will be entries like

    15 its
    ...
     1 its

Whereas my code will give only

    16 its

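I have not confirmed why this is, but the most likely cause is that the pipeline runs `sort` before 
lowercasing: differently-cased spellings of a word (e.g. 'Its' and 'its') need not end up on adjacent 
lines, and `uniq` only merges adjacent lines. It seems unimportant--this means my code works even better!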
"""

import re
import sys

def parse_threshold(text):
    """Convert the THRESHOLD command-line argument to an int.

    Args:
        text: The raw argument string (everything in argv is a str).

    Returns:
        The threshold as a non-negative int, or None when *text* is not made
        up entirely of decimal digits (empty, signed, or containing letters).
    """
    # The whole string must be digits.  Checking only the first character
    # (which is all re.match(r"\D", ...) does) lets values such as "1a" or ""
    # through, and int() then dies with a traceback instead of the friendly
    # error message.
    if re.fullmatch(r"\d+", text) is None:
        return None
    return int(text)


def count_words(lines):
    """Tally how often each word occurs, ignoring case.

    Each non-empty line is treated as exactly one word; no punctuation or
    whitespace is stripped.

    Args:
        lines: Iterable of strings without their trailing newline.

    Returns:
        A dict mapping the lowercased word to its frequency, with keys in
        first-seen order.
    """
    counts = {}
    for line in lines:
        if not line:  # skip empty lines
            continue
        # Lowercase before counting so 'Dog' and 'dog' are the same word.
        word = line.lower()
        # A dict lookup replaces a linear scan over every word seen so far.
        counts[word] = counts.get(word, 0) + 1
    return counts


def frequent_words(counts, threshold):
    """Select the words that occur at least *threshold* times.

    Args:
        counts: Mapping of word to frequency, as built by count_words().
        threshold: Minimum frequency a word needs in order to be reported.

    Returns:
        A list of (word, freq) tuples ordered by descending frequency.
        sorted() is stable, so words with equal frequency keep the order in
        which they appear in *counts*.
    """
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    # ">=": a word whose frequency equals the threshold must be reported too.
    return [(word, freq) for word, freq in ranked if freq >= threshold]


def main(argv):
    """Print the frequency of every sufficiently common word in a file.

    Args:
        argv: Full argument vector; argv[1] is the input file and argv[2] is
            the minimum frequency.  Any further arguments are ignored.

    Returns:
        The process exit status: 0 on success, 1 on bad arguments.
    """
    if len(argv) < 3:
        program = argv[0] if argv else "wordfreq"
        print(f"Usage: {program} FILE THRESHOLD", file=sys.stderr)
        return 1
    filename, threshold_text = argv[1:3]

    # Validate arguments
    threshold = parse_threshold(threshold_text)
    if threshold is None:
        print("The threshold must be a number.")
        return 1

    # Read file and tally word frequencies.  The with-block guarantees the
    # file is closed even if reading fails.
    with open(filename) as fh:
        counts = count_words(fh.read().split('\n'))

    # Print words and their frequencies, most frequent first (the same order
    # as `sort -nr`).  Only print a word if its frequency is greater than or
    # equal to the threshold.
    for word, freq in frequent_words(counts, threshold):
        print(f"{freq:>7} {word}")
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))