Plagiarism Checker Python Project [with source code | report | presentation and documentation] - Whoopee Webs

 


With content creation and blogging one of the good businesses in the market everyone wants to try their hands on this but some lack sufficient funds to give their articles a free plagiarism check as mostly plagiarism checkers do not come for free. Building a Python plagiarism checker could be built here using a natural language processing library along with the search API to search the first few pages of Google and detect plagiarism if any.


SOURCE CODE

""" Plagiarism Checker
----------------------------------------
"""
import click
from .matcher import Text, ExtendedMatch, Matcher
import os
import glob
import csv
import logging
import itertools
logger = logging.getLogger()
logger.setLevel(logging.INFO)
def getFiles(path): 
    """ 
    Determines whether a path is a file or directory. 
    If it's a directory, it gets a list of all the text files 
    in that directory, recursively. If not, it gets the file. 
    """
    if os.path.isfile(path): 
        return [path]
    elif os.path.isdir(path): 
        // Get list of all files in dir, recursively. 
        return glob.glob(path + "/**/*.txt", recursive=True)
    else: 
        raise click.ClickException("The path %s doesn't appear to be a file or directory" % path) 
def checkLog(logfile, textpair): 
    """ 
    Checks the log file to make sure we haven't already done a particular analysis. 
    Returns True if the pair is in the log already. 
    """
    pairs = []
    logging.debug('Looking in the log for textpair:' % textpair)
    if not os.path.isfile(logfile): 
        logging.debug('No log file found.')
        return None
    with open(logfile, newline='') as f:
        reader = csv.reader(f)
        for row in reader:
            pairs.append([row[0], row[1]])
    // logging.debug('Pairs already in log: %s' % pairs)
    return textpair in pairs
def createLog(logfile, columnLabels): 
    """ 
    Creates a log file and sets up headers so that it can be easily read 
    as a CSV later. 
    """
    header = ','.join(columnLabels) + '\n'
    with open(logfile, 'w') as f: 
        f.write(header) 
        f.close
@click.command()
@click.argument('text1')
@click.argument('text2') 
@click.option('-t', '--threshold', type=int, default=3, \
        help='The shortest length of match to include in the list of initial matches.')
@click.option('-c', '--cutoff', type=int, default=5, \
        help='The shortest length of match to include in the final list of extended matches.')
@click.option('-n', '--ngrams', type=int, default=3, \
        help='The ngram n-value to match against.')
@click.option('-l', '--logfile', default='log.txt', help='The name of the log file to write to.'
@click.option('--stops', is_flag=True, help='Include stopwords in matching.', default=False)
@click.option('--verbose', is_flag=True, help='Enable verbose mode, giving more information.')
def cli(text1, text2, threshold, cutoff, ngrams, logfile, verbose, stops):
    """ This program finds similar text in two text files. """
    //Determine whether the given path is a file or directory. 
    texts1 = getFiles(text1)
    texts2 = getFiles(text2) 
    if verbose: 
        logging.basicConfig(level=logging.DEBUG)
    if stops: 
        logging.debug('Including stopwords in tokenizing.') 
    logging.debug('Comparing this/these text(s): %s' % str(texts1))
    logging.debug('with this/these text(s): %s' % str(texts2))
    pairs = list(itertools.product(texts1, texts2))
    numPairs = len(pairs) 
    logging.debug('Comparing %s pairs.' % numPairs)
    // logging.debug('List of pairs to compare: %s' % pairs)
    logging.debug('Loading files into memory.')
    texts = {}
    prevTextObjs = {}
    for filename in texts1+texts2: 
        with open(filename, errors="ignore") as f: 
            text = f.read() 
        if filename not in texts: 
            texts[filename] = text
    logging.debug('Loading complete.')
    for index, pair in enumerate(pairs): 
        timeStart = os.times().elapsed
        logging.debug('Now comparing pair %s of %s.' % (index+1, numPairs))
        logging.debug('Comparing %s with %s.' % (pair[0], pair[1]))
        // Make sure we haven't already done this pair. 
        inLog = checkLog(logfile, [pair[0], pair[1]])
        if inLog is None: 
            // This means that there isn't a log file. Let's set one up.
            // Set up columns and their labels. 
            columnLabels = ['Text A', 'Text B', 'Threshold', 'Cutoff', 'N-Grams', 'Num Matches', 'Text A Length', 'Text B Length', 'Locations in A', 'Locations in B']
            logging.debug('No log file found. Setting one up.')
            createLog(logfile, columnLabels)
        if inLog: 
            logging.debug('This pair is already in the log. Skipping.')
            continue
        logging.debug('Processing texts.')
        filenameA, filenameB = pair[0], pair[1]
        textA, textB = texts[filenameA], texts[filenameB]
        // Put this in a dictionary so we don't have to process a file twice.
        for filename in [filenameA, filenameB]: 
            if filename not in prevTextObjs: 
                logging.debug('Processing text: %s' % filename)
                prevTextObjs[filename] = Text(texts[filename], filename)
        // Just more convenient naming. 
        textObjA = prevTextObjs[filenameA]
        textObjB = prevTextObjs[filenameB]
        // Reset the table of previous text objects, so we don't overload memory. 
        // This means we'll only remember the previous two texts. 
        prevTextObjs = 
        // Do the matching. 
        myMatch = Matcher(textObjA, textObjB, threshold=threshold, cutoff=cutoff, ngramSize=ngrams, removeStopwords=stops)
        myMatch.match()
        timeEnd = os.times().elapsed
        timeElapsed = timeEnd-timeStart
        logging.debug('Matching completed in %s seconds.' % timeElapsed)
        // Write to the log, but only if a match is found.
        if myMatch.numMatches > 0: 
            logItems = [pair[0], pair[1], threshold, cutoff, ngrams, myMatch.numMatches, myMatch.textA.length, myMatch.textB.length, str(myMatch.locationsA), str(myMatch.locationsB)]
            logging.debug('Logging items: %s' % str(logItems))
            line = ','.join(['"%s"' % item for item in logItems]) + '\n'
            f = open(logfile, 'a')
            f.write(line)
            f.close()
if __name__ == '__main__':
    cli()

Post a Comment

0 Comments