utils

Convert an arXiv id into an HTML anchor (`<a>` element) that links to the paper's arXiv abstract page.

Parameters:
  • arxivid (str) –

    arxiv id

Returns:
  • link (str)

Source code in src/racp/utils.py
def arxivid2link(arxivid):
    """Render an arXiv id as an HTML anchor pointing at its abstract page.

    Args:
        arxivid (str): arXiv identifier. It is interpolated verbatim, with
            no escaping applied.

    Returns:
        str: an ``<a>`` element whose ``href`` is
            ``https://arxiv.org/abs/<arxivid>`` and whose visible text is
            the id itself.
    """
    target = f"https://arxiv.org/abs/{arxivid}"
    return f'<a href="{target}">{arxivid}</a>'

ccbc(paperA, paperB)

Calculate citation similarity index.

CCBC stands for co-citation and bibliographic coupling; the index additionally accounts for a direct citation between the two papers.

Parameters:
  • paperA

    PaperItem

  • paperB

    PaperItem

Returns:
  • index

    [0,1]

Source code in src/racp/utils.py
def ccbc(paperA, paperB):
    """Compute the citation similarity index of two papers.

    CCBC combines co-citation and bibliographic coupling with a bonus for a
    direct citation between the two papers. Three terms are added up:

    * 0.5 if either paper's ``ss_id`` appears in the other's ``citations``;
    * the share of ``citations`` entries the two papers have in common
      (size of the intersection over size of the union);
    * the same ratio computed on ``references``.

    The sum is divided by 2.5, the largest value it can reach.

    Args:
        paperA: PaperItem; read for ``ss_id``, ``citations`` and
            ``references``.
        paperB: PaperItem; read for the same attributes.

    Returns:
        index: [0,1]
    """
    def overlap_ratio(left, right):
        # |left ∩ right| / |left ∪ right|; an empty union contributes nothing.
        combined = left.union(right)
        if not combined:
            return 0
        return len(left.intersection(right)) / len(combined)

    cites_directly = (
        paperA.ss_id in paperB.citations
        or paperB.ss_id in paperA.citations
    )

    total = 0
    if cites_directly:
        total += 0.5
    total += overlap_ratio(paperA.citations, paperB.citations)
    total += overlap_ratio(paperA.references, paperB.references)

    return total / 2.5

load_config(config_path)

Load configuration from a YAML file.

Parameters:
  • config_path (str) –

    path to the YAML configuration file.

Returns:
  • ConfigObject( ConfigObject ) –

    a ConfigObject instance containing the configuration data.

Raises:
  • FileNotFoundError

    if the configuration file is not found.

  • YAMLError

    if the configuration file is not valid YAML.

  • ValueError

    if the file yields no configuration data (the parsed content is None).

Source code in src/racp/utils.py
def load_config(config_path: str) -> ConfigObject:
    """Load configuration from a YAML file.

    Args:
        config_path (str): path to the YAML configuration file.

    Returns:
        ConfigObject: a ConfigObject instance wrapping the parsed data.

    Raises:
        FileNotFoundError: if the configuration file is not found.
        yaml.YAMLError: if the configuration file is not valid YAML.
        ValueError: if parsing yields ``None`` (for example, the file is
            empty), so there is nothing to wrap.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # encoding, which is not UTF-8 on every system.
    with open(config_path, 'r', encoding="utf-8") as config_file:
        config_data = yaml.safe_load(config_file)

    if config_data is None:
        raise ValueError("Failed to load configuration data")
    return ConfigObject(config_data)

makedir(path, logger)

A mkdir helper that recursively creates a directory (including missing parents) and logs the creation.

Parameters:
  • path (str) –

    The path of the directory you want to create.

  • logger

    loguru logger

Returns:
  • path

    The input path.

Source code in src/racp/utils.py
def makedir(
    path : str,
    logger
):
    '''Create a directory, including missing parents, and log the creation.

    Nothing is created or logged when ``path`` already exists.

    Args:
        path: The path of the directory you want to create.
        logger: loguru logger; only its ``debug`` method is used.

    Returns:
        path: The input path, unchanged.
    '''
    if os.path.exists(path):
        return path

    os.makedirs(path)
    logger.debug(f"Create {path}")
    return path

parse_pdf(pdfpath=None, stream=None)

Parse pdf files into raw text.

Parameters:
  • pdfpath

    The path to pdf file.

  • stream

    If your pdf has already been read in binary mode, then use this arg.

Returns:
  • text

    Raw text.

Source code in src/racp/utils.py
def parse_pdf(
    pdfpath=None,
    stream=None
):
    '''Parse a pdf file into raw text.

    Args:
        pdfpath: The path to pdf file. Only used when ``stream`` is None.
        stream: If your pdf has already been opened in binary mode, pass the
            file-like object here; its ``read()`` result is handed to
            ``fitz``. Takes precedence over ``pdfpath``.

    Returns:
        text: Raw text, i.e. the ``get_text()`` output of every page
            concatenated in page order.
    '''
    import fitz
    if stream is not None:
        pdf = fitz.open(stream=stream.read(), filetype="pdf")
    else:
        pdf = fitz.open(pdfpath)
    try:
        # join() avoids repeated string concatenation on long documents.
        return "".join(page.get_text() for page in pdf.pages())
    finally:
        # Release the document even if text extraction fails.
        pdf.close()

powerlaw_fit(data, xmin=None, fit_method='Likelihood')

This function uses powerlaw to fit the given data and returns a powerlaw.Fit object.

Parameters:
  • data

    List or array.

  • xmin

    The data value beyond which distributions should be fitted.

  • fit_method

    "Likelihood" as default, "KS" optional.

Returns:
  • fit

    A powerlaw.Fit object.

Source code in src/racp/utils.py
def powerlaw_fit(
    data,
    xmin=None,
    fit_method="Likelihood"
):
    '''Fit the given data with `powerlaw` and return the `powerlaw.Fit` object.

    Args:
        data: List or array.
        xmin: The data value beyond which distributions should be fitted.
        fit_method: "Likelihood" as default, "KS" optional.

    Returns:
        fit: The `powerlaw.Fit` object built from the arguments above.
    '''
    from powerlaw import Fit

    return Fit(data, xmin=xmin, fit_method=fit_method)

powerlaw_fit_cdf(data, xmin=None, fit_method='Likelihood')

This function uses powerlaw to fit the given data and returns the cdf.

Note that values absent from the given data have no cdf value of their own. For example, if 99 appears in the data but 100 does not, the returned dict uses the cdf of 99 as a substitute for the cdf of 100. In addition, the returned dict's keys do not start at 0; you can treat the cdf of every value smaller than the smallest key as 0.

Parameters:
  • data

    List or array.

  • xmin

    The data value beyond which distributions should be fitted.

  • fit_method

    "Likelihood" as default, "KS" optional.

Returns:
  • cdf

    A dictionary formatted as {x : cdf(x)}.

Source code in src/racp/utils.py
def powerlaw_fit_cdf(
    data,
    xmin=None,
    fit_method="Likelihood"
):
    '''Fit the given data with `powerlaw` and return its cdf as a dict.

    Values that do not occur in the data have no cdf entry of their own.
    Gaps between consecutive fitted x values are therefore filled with
    integer keys carrying the cdf of the preceding x: if 99 is in the data
    but 100 is not, the cdf of 99 is stored for 100. The keys do not start
    at 0; treat the cdf of anything below the smallest key as 0.

    Args:
        data: List or array.
        xmin: The data value beyond which distributions should be fitted.
        fit_method: "Likelihood" as default, "KS" optional.

    Returns:
        cdf: A dictionary formatted as {x : cdf(x)}.
    '''
    fitted = powerlaw_fit(data, xmin, fit_method).cdf()
    # fitted[0] holds the x values and fitted[1] the matching cdf values.
    # NOTE(review): the gap filling assumes the x values arrive in ascending
    # order; confirm against powerlaw.
    table = dict(zip(fitted[0], fitted[1]))

    previous_key = min(table)
    previous_value = 0
    fillers = {}
    for key, value in table.items():
        if key != previous_key + 1:
            # Every integer strictly between the two keys inherits the cdf
            # of the smaller key.
            gap = range(int(previous_key) + 1, int(key))
            fillers.update(dict.fromkeys(gap, previous_value))
        previous_key, previous_value = key, value

    table.update(fillers)
    return table

save_json(obj, path, logger, name=None)

A helper function to save json files.

Parameters:
  • obj

    The object you want to save.

  • path (str) –

    The file path to save.

  • logger

    loguru logger.

  • name

    Default to None. Appear in log.

Source code in src/racp/utils.py
def save_json(
    obj,
    path : str,
    logger,
    name=None
):
    '''Write an object to a JSON file and log the operation.

    The file is written as UTF-8 with a 4-space indent. A debug message
    naming the current process id, ``name`` and ``path`` is emitted before
    the file is opened.

    Args:
        obj: The object you want to save.
        path: The file path to save.
        logger: loguru logger; only its ``debug`` method is used.
        name: Default to None. Appears in the log message only.
    '''
    pid = os.getpid()
    logger.debug(f"Process {pid} : Saving {name} to {path}")

    with open(path, "w", encoding="utf-8") as out:
        json.dump(obj, out, indent=4)

weighted_ccbc(paperA, paperB, weight=dict())

Calculate citation similarity index.

CCBC stands for co-citation and bibliographic coupling; the index additionally accounts for a direct citation between the two papers.

Parameters:
  • paperA

    PaperItem

  • paperB

    PaperItem

  • weight

    A dict mapping paper ids to weights; a paper missing from the dict gets weight 1.

Returns:
  • index

    [0,1]

Source code in src/racp/utils.py
def weighted_ccbc(paperA, paperB, weight=None):
    """Calculate the weighted citation similarity index.

    CCBC stands for co-citation and bibliographic coupling; a direct
    citation between the two papers is counted as well. Four terms are
    added up:

    * ``weight[paperA] / 6`` if paperA's ``ss_id`` is in paperB's
      ``citations``;
    * ``weight[paperB] / 6`` if paperB's ``ss_id`` is in paperA's
      ``citations``;
    * the shared-citation ratio (intersection size over union size),
      scaled by both papers' weights, divided by 3;
    * the shared-reference ratio, where each reference counts with its own
      weight, divided by 3.

    Args:
        paperA: PaperItem; read for ``ss_id``, ``citations`` and
            ``references``.
        paperB: PaperItem; read for the same attributes.
        weight: A dict mapping ``ss_id`` to that paper's weight. Ids missing
            from the dict weigh 1. ``None`` (the default) means no weights.

    Returns:
        index: within [0,1] when every weight is 1; weights above 1 can push
            the result beyond 1.
    """
    # Use None as the default so no mutable dict is shared between calls.
    if weight is None:
        weight = {}

    weight_a = weight.get(paperA.ss_id, 1)
    weight_b = weight.get(paperB.ss_id, 1)

    score = 0
    # 1. direct citation relationship, one term per direction
    if paperA.ss_id in paperB.citations:
        score += weight_a / 6
    if paperB.ss_id in paperA.citations:
        score += weight_b / 6
    # 2. shared citation ratio
    cocite = paperA.citations.intersection(paperB.citations)
    if cocite:
        # A non-empty intersection guarantees a non-empty union.
        alcite = paperA.citations.union(paperB.citations)
        score += weight_a * weight_b * len(cocite) / len(alcite) / 3
    # 3. shared reference ratio
    coref = paperA.references.intersection(paperB.references)
    if coref:
        alref = paperA.references.union(paperB.references)
        shared_weight = sum(weight.get(ss_id, 1) for ss_id in coref)
        total_weight = sum(weight.get(ss_id, 1) for ss_id in alref)
        score += shared_weight / total_weight / 3

    return score