With content creation and blogging one of the good businesses in the market everyone wants to try their hands on this but some lack sufficient funds to give their articles a free plagiarism check as mostly plagiarism checkers do not come for free. Building a Python plagiarism checker could be built here using a natural language processing library along with the search API to search the first few pages of Google and detect plagiarism if any.
SOURCE CODE
""" Plagiarism Checker
----------------------------------------
"""
import click
from .matcher import Text, ExtendedMatch, Matcher
import os
import glob
import csv
import logging
import itertools
logger = logging.getLogger()
logger.setLevel(logging.INFO)
def getFiles(path):
"""
Determines whether a path is a file or directory.
If it's a directory, it gets a list of all the text files
in that directory, recursively. If not, it gets the file.
"""
if os.path.isfile(path):
return [path]
elif os.path.isdir(path):
// Get list of all files in dir, recursively.
return glob.glob(path + "/**/*.txt", recursive=True)
else:
raise click.ClickException("The path %s doesn't appear to be a file or directory" % path)
def checkLog(logfile, textpair):
"""
Checks the log file to make sure we haven't already done a particular analysis.
Returns True if the pair is in the log already.
"""
pairs = []
logging.debug('Looking in the log for textpair:' % textpair)
if not os.path.isfile(logfile):
logging.debug('No log file found.')
return None
with open(logfile, newline='') as f:
reader = csv.reader(f)
for row in reader:
pairs.append([row[0], row[1]])
// logging.debug('Pairs already in log: %s' % pairs)
return textpair in pairs
def createLog(logfile, columnLabels):
"""
Creates a log file and sets up headers so that it can be easily read
as a CSV later.
"""
header = ','.join(columnLabels) + '\n'
with open(logfile, 'w') as f:
f.write(header)
f.close
@click.command()
@click.argument('text1')
@click.argument('text2')
@click.option('-t', '--threshold', type=int, default=3, \
help='The shortest length of match to include in the list of initial matches.')
@click.option('-c', '--cutoff', type=int, default=5, \
help='The shortest length of match to include in the final list of extended matches.')
@click.option('-n', '--ngrams', type=int, default=3, \
help='The ngram n-value to match against.')
@click.option('-l', '--logfile', default='log.txt', help='The name of the log file to write to.'
@click.option('--stops', is_flag=True, help='Include stopwords in matching.', default=False)
@click.option('--verbose', is_flag=True, help='Enable verbose mode, giving more information.')
def cli(text1, text2, threshold, cutoff, ngrams, logfile, verbose, stops):
""" This program finds similar text in two text files. """
//Determine whether the given path is a file or directory.
texts1 = getFiles(text1)
texts2 = getFiles(text2)
if verbose:
logging.basicConfig(level=logging.DEBUG)
if stops:
logging.debug('Including stopwords in tokenizing.')
logging.debug('Comparing this/these text(s): %s' % str(texts1))
logging.debug('with this/these text(s): %s' % str(texts2))
pairs = list(itertools.product(texts1, texts2))
numPairs = len(pairs)
logging.debug('Comparing %s pairs.' % numPairs)
// logging.debug('List of pairs to compare: %s' % pairs)
logging.debug('Loading files into memory.')
texts = {}
prevTextObjs = {}
for filename in texts1+texts2:
with open(filename, errors="ignore") as f:
text = f.read()
if filename not in texts:
texts[filename] = text
logging.debug('Loading complete.')
for index, pair in enumerate(pairs):
timeStart = os.times().elapsed
logging.debug('Now comparing pair %s of %s.' % (index+1, numPairs))
logging.debug('Comparing %s with %s.' % (pair[0], pair[1]))
// Make sure we haven't already done this pair.
inLog = checkLog(logfile, [pair[0], pair[1]])
if inLog is None:
// This means that there isn't a log file. Let's set one up.
// Set up columns and their labels.
columnLabels = ['Text A', 'Text B', 'Threshold', 'Cutoff', 'N-Grams', 'Num Matches', 'Text A Length', 'Text B Length', 'Locations in A', 'Locations in B']
logging.debug('No log file found. Setting one up.')
createLog(logfile, columnLabels)
if inLog:
logging.debug('This pair is already in the log. Skipping.')
continue
logging.debug('Processing texts.')
filenameA, filenameB = pair[0], pair[1]
textA, textB = texts[filenameA], texts[filenameB]
// Put this in a dictionary so we don't have to process a file twice.
for filename in [filenameA, filenameB]:
if filename not in prevTextObjs:
logging.debug('Processing text: %s' % filename)
prevTextObjs[filename] = Text(texts[filename], filename)
// Just more convenient naming.
textObjA = prevTextObjs[filenameA]
textObjB = prevTextObjs[filenameB]
// Reset the table of previous text objects, so we don't overload memory.
// This means we'll only remember the previous two texts.
prevTextObjs =
// Do the matching.
myMatch = Matcher(textObjA, textObjB, threshold=threshold, cutoff=cutoff, ngramSize=ngrams, removeStopwords=stops)
myMatch.match()
timeEnd = os.times().elapsed
timeElapsed = timeEnd-timeStart
logging.debug('Matching completed in %s seconds.' % timeElapsed)
// Write to the log, but only if a match is found.
if myMatch.numMatches > 0:
logItems = [pair[0], pair[1], threshold, cutoff, ngrams, myMatch.numMatches, myMatch.textA.length, myMatch.textB.length, str(myMatch.locationsA), str(myMatch.locationsB)]
logging.debug('Logging items: %s' % str(logItems))
line = ','.join(['"%s"' % item for item in logItems]) + '\n'
f = open(logfile, 'a')
f.write(line)
f.close()
if __name__ == '__main__':
cli()
0 Comments